diff --git a/.claude/settings.json b/.claude/settings.json
index ed78662..d77278a 100644
--- a/.claude/settings.json
+++ b/.claude/settings.json
@@ -96,41 +96,57 @@
],
"PreToolUse": [
{
+ "matcher": "Bash|Write|Edit",
"hooks": [
{
"type": "command",
"command": "python3 \"$HOME/.claude/hooks/pretool-unified-gate.py\"",
"description": "Unified gate: gitignore-bypass, git-submission, dangerous-command, creation-gate, sensitive-file (ADR-068)",
"timeout": 3000
- },
+ }
+ ]
+ },
+ {
+ "matcher": "Bash",
+ "hooks": [
{
"type": "command",
- "command": "python3 \"$HOME/.claude/hooks/pretool-synthesis-gate.py\"",
- "description": "Consultation synthesis gate: blocks implementation when ADR consultation is incomplete",
+ "command": "python3 \"$HOME/.claude/hooks/pretool-branch-safety.py\"",
+ "description": "Branch safety: blocks git commit on main/master, forces feature branches",
"timeout": 3000
},
{
"type": "command",
- "command": "python3 \"$HOME/.claude/hooks/pretool-branch-safety.py\"",
- "description": "Branch safety: blocks git commit on main/master, forces feature branches",
+ "command": "python3 \"$HOME/.claude/hooks/ci-merge-gate.py\"",
+ "description": "Gate: block merge to main/master when CI checks are red",
"timeout": 3000
- },
+ }
+ ]
+ },
+ {
+ "matcher": "Bash|Edit",
+ "hooks": [
{
"type": "command",
- "command": "python3 \"$HOME/.claude/hooks/pretool-plan-gate.py\"",
- "description": "Plan gate: blocks implementation code without task_plan.md",
+ "command": "python3 \"$HOME/.claude/hooks/pretool-learning-injector.py\"",
+ "description": "Inject known error patterns before Bash/Edit tools run",
"timeout": 3000
- },
+ }
+ ]
+ },
+ {
+ "matcher": "Write|Edit",
+ "hooks": [
{
"type": "command",
- "command": "python3 \"$HOME/.claude/hooks/pretool-adr-creation-gate.py\"",
- "description": "ADR creation gate: blocks new components without an ADR in adr/",
+ "command": "python3 \"$HOME/.claude/hooks/pretool-synthesis-gate.py\"",
+ "description": "Consultation synthesis gate: blocks implementation when ADR consultation is incomplete",
"timeout": 3000
},
{
"type": "command",
- "command": "python3 \"$HOME/.claude/hooks/pretool-learning-injector.py\"",
- "description": "Inject known error patterns before Bash/Edit tools run",
+ "command": "python3 \"$HOME/.claude/hooks/pretool-plan-gate.py\"",
+ "description": "Plan gate: blocks implementation code without task_plan.md",
"timeout": 3000
},
{
@@ -138,39 +154,51 @@
"command": "python3 \"$HOME/.claude/hooks/pretool-prompt-injection-scanner.py\"",
"description": "Advisory scan for prompt injection patterns in agent context files (ADR-070)",
"timeout": 3000
- },
+ }
+ ]
+ },
+ {
+ "matcher": "Write",
+ "hooks": [
{
"type": "command",
- "command": "python3 \"$HOME/.claude/hooks/pretool-subagent-warmstart.py\"",
- "description": "Inject parent session context into subagent prompts (ADR-088)",
- "timeout": 5000
- },
+ "command": "python3 \"$HOME/.claude/hooks/pretool-adr-creation-gate.py\"",
+ "description": "ADR creation gate: blocks new components without an ADR in adr/",
+ "timeout": 3000
+ }
+ ]
+ },
+ {
+ "matcher": "Edit",
+ "hooks": [
{
"type": "command",
"command": "python3 \"$HOME/.claude/hooks/pretool-file-backup.py\"",
"description": "Backup files before Edit tool modifies them",
"timeout": 3000
- },
+ }
+ ]
+ },
+ {
+ "matcher": "Agent",
+ "hooks": [
{
"type": "command",
- "command": "python3 \"$HOME/.claude/hooks/ci-merge-gate.py\"",
- "description": "Gate: block merge to main/master when CI checks are red",
- "timeout": 3000
+ "command": "python3 \"$HOME/.claude/hooks/pretool-subagent-warmstart.py\"",
+ "description": "Inject parent session context into subagent prompts (ADR-088)",
+ "timeout": 5000
}
]
}
],
"PostToolUse": [
{
+ "matcher": "Write|Edit",
"hooks": [
{
"type": "command",
- "command": "python3 \"$HOME/.claude/hooks/post-tool-lint-hint.py\""
- },
- {
- "type": "command",
- "command": "python3 \"$HOME/.claude/hooks/error-learner.py\"",
- "description": "Learn from tool errors and suggest solutions"
+ "command": "python3 \"$HOME/.claude/hooks/post-tool-lint-hint.py\"",
+ "description": "Gentle lint reminder after file modifications"
},
{
"type": "command",
@@ -185,48 +213,82 @@
},
{
"type": "command",
- "command": "python3 \"$HOME/.claude/hooks/routing-gap-recorder.py\"",
- "description": "Record /do routing gaps to learning DB for pattern tracking",
- "timeout": 2000
- },
+ "command": "python3 \"$HOME/.claude/hooks/posttool-security-scan.py\"",
+ "description": "Advisory scan for credentials and SQL injection in Write/Edit output",
+ "timeout": 3000
+ }
+ ]
+ },
+ {
+ "matcher": "Bash",
+ "hooks": [
{
"type": "command",
"command": "python3 \"$HOME/.claude/hooks/retro-graduation-gate.py\"",
"description": "Warn about ungraduated retro entries when creating PRs in toolkit repo",
"timeout": 3000
- },
+ }
+ ]
+ },
+ {
+ "matcher": "Edit|Write|Bash",
+ "hooks": [
{
"type": "command",
"command": "python3 \"$HOME/.claude/hooks/record-activation.py\"",
"description": "Record session activation stats for ROI tracking (ADR-032)"
- },
- {
- "type": "command",
- "command": "python3 \"$HOME/.claude/hooks/record-waste.py\"",
- "description": "Record wasted tokens from tool failures for ROI tracking (ADR-032)"
- },
+ }
+ ]
+ },
+ {
+ "matcher": "Read",
+ "hooks": [
{
"type": "command",
"command": "python3 \"$HOME/.claude/hooks/posttool-session-reads.py\"",
"description": "Track files read this session for subagent warmstart (ADR-088)"
- },
- {
- "type": "command",
- "command": "python3 \"$HOME/.claude/hooks/posttool-security-scan.py\"",
- "description": "Advisory scan for credentials and SQL injection in Write/Edit output",
- "timeout": 3000
- },
+ }
+ ]
+ },
+ {
+ "matcher": "Skill|Agent",
+ "hooks": [
{
"type": "command",
"command": "python3 \"$HOME/.claude/hooks/usage-tracker.py\"",
"description": "Record Skill and Agent invocation analytics",
"timeout": 3000
- },
+ }
+ ]
+ },
+ {
+ "matcher": "Agent",
+ "hooks": [
{
"type": "command",
"command": "python3 \"$HOME/.claude/hooks/review-capture.py\"",
"description": "Capture CRITICAL/HIGH review findings to learning DB",
"timeout": 3000
+ }
+ ]
+ },
+ {
+ "hooks": [
+ {
+ "type": "command",
+ "command": "python3 \"$HOME/.claude/hooks/error-learner.py\"",
+ "description": "Learn from tool errors and suggest solutions"
+ },
+ {
+ "type": "command",
+ "command": "python3 \"$HOME/.claude/hooks/routing-gap-recorder.py\"",
+ "description": "Record /do routing gaps to learning DB for pattern tracking",
+ "timeout": 2000
+ },
+ {
+ "type": "command",
+ "command": "python3 \"$HOME/.claude/hooks/record-waste.py\"",
+ "description": "Record wasted tokens from tool failures for ROI tracking (ADR-032)"
},
{
"type": "command",
diff --git a/.gitignore b/.gitignore
index badb744..8cbc4c5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -94,3 +94,11 @@ draft-*.md
# Scratch notes (session working files, not committed)
scratch/
+
+# Eval workspaces (A/B/C test outputs, generated code, grading artifacts)
+# These are ephemeral experiment data — not committed
+*-workspace/
+evals/
+
+# Feature state (ephemeral, per-session feature lifecycle)
+.feature/
diff --git a/agents/INDEX.json b/agents/INDEX.json
index b2d730a..7b59053 100644
--- a/agents/INDEX.json
+++ b/agents/INDEX.json
@@ -4,7 +4,7 @@
"agents": {
"agent-creator-engineer": {
"file": "agent-creator-engineer.md",
- "short_description": "**DEPRECATED**: Use skill-creator-engineer agent instead",
+ "short_description": "**DEPRECATED**: Use skill-creator skill instead",
"triggers": [
"create agent",
"new agent",
@@ -14,7 +14,7 @@
"legacy agent creation"
],
"pairs_with": [
- "skill-creator-engineer",
+ "skill-creator",
"agent-evaluation"
],
"complexity": "Simple",
@@ -1066,26 +1066,6 @@
"complexity": "Simple",
"category": "meta"
},
- "skill-creator-engineer": {
- "file": "skill-creator-engineer.md",
- "short_description": "Use this agent when creating new Claude Code skills, designing workflow automation,\nor improving existing skill architecture",
- "triggers": [
- "create skill",
- "new skill",
- "skill template",
- "skill design",
- "workflow automation",
- "skill improvement",
- "refactor skill"
- ],
- "pairs_with": [
- "agent-evaluation",
- "verification-before-completion",
- "workflow-orchestrator"
- ],
- "complexity": "Medium-Complex",
- "category": "meta"
- },
"sqlite-peewee-engineer": {
"file": "sqlite-peewee-engineer.md",
"short_description": "Use this agent when you need expert assistance with SQLite database development using the Peewee ORM in Python",
diff --git a/agents/README.md b/agents/README.md
index 2237adb..3cd6dac 100644
--- a/agents/README.md
+++ b/agents/README.md
@@ -96,12 +96,12 @@ Each agent is defined in `agents/*.md` with YAML frontmatter specifying model, v
| Agent | Description |
|-------|-------------|
-| `skill-creator-engineer` | Create Claude Code skills: progressive disclosure, SKILL.md structure, complexity tier selection |
+| `skill-creator` | Create Claude Code skills: progressive disclosure, SKILL.md structure, complexity tier selection |
| `hook-development-engineer` | Python hooks: PostToolUse/PreToolUse/SessionStart handlers, sub-50ms performance, learning DB |
| `pipeline-orchestrator-engineer` | Build pipelines: multi-component scaffolding, fan-out/fan-in patterns, routing integration |
| `system-upgrade-engineer` | Ecosystem upgrades: 6-phase pipeline for adapting to Claude Code releases or goal shifts |
| `toolkit-governance-engineer` | Toolkit internal architecture: SKILL.md edits, routing tables, ADR lifecycle, INDEX.json, hook compliance |
-| `agent-creator-engineer` | **DEPRECATED** — use `skill-creator-engineer` instead |
+| `agent-creator-engineer` | **DEPRECATED** — use `skill-creator` instead |
---
diff --git a/agents/README.txt b/agents/README.txt
deleted file mode 100644
index 1deba18..0000000
--- a/agents/README.txt
+++ /dev/null
@@ -1,249 +0,0 @@
-# Agents
-
-Specialized domain experts that Claude Code can spawn for complex tasks requiring deep knowledge.
-
----
-
-## What are Agents?
-
-Agents are **domain experts** defined as comprehensive markdown files. Each agent embodies:
-- **Deep domain knowledge** - Extensive patterns, anti-patterns, and best practices
-- **Real code examples** - Production-ready snippets, not aspirational pseudocode
-- **Operator Model configuration** - Hardcoded, default, and optional behaviors
-
-Agents differ from skills: **agents know things deeply**, **skills know how to do things**.
-
-```
-Agent: "I understand Go concurrency patterns and can review your code"
-Skill: "I know the 4-phase debugging methodology"
-```
-
----
-
-## Available Agents
-
-### Language & Framework Experts
-
-| Agent | Domain | Lines |
-|-------|--------|-------|
-| [`golang-general-engineer`](golang-general-engineer.md) | Go development, patterns, concurrency | 95K |
-| [`golang-general-engineer-compact`](golang-general-engineer-compact.md) | Go (compact variant for faster loading) | ~30K |
-| [`python-general-engineer`](python-general-engineer.md) | Python development, best practices | ~40K |
-| [`python-openstack-engineer`](python-openstack-engineer.md) | OpenStack Python development | 37K |
-| [`typescript-frontend-engineer`](typescript-frontend-engineer.md) | TypeScript, React patterns | 34K |
-| [`nodejs-api-engineer`](nodejs-api-engineer.md) | Node.js backend development | 43K |
-| [`nextjs-ecommerce-engineer`](nextjs-ecommerce-engineer.md) | Next.js e-commerce | 35K |
-| [`react-portfolio-engineer`](react-portfolio-engineer.md) | React portfolio sites | 29K |
-
-### Code Quality & Review
-
-| Agent | Domain | Lines |
-|-------|--------|-------|
-| [`testing-automation-engineer`](testing-automation-engineer.md) | Test strategies, automation | 45K |
-| [`technical-documentation-engineer`](technical-documentation-engineer.md) | Technical writing, API docs | 97K |
-| [`technical-journalist-writer`](technical-journalist-writer.md) | Technical articles, journalism | ~50K |
-
-### Infrastructure & DevOps
-
-| Agent | Domain | Lines |
-|-------|--------|-------|
-| [`kubernetes-helm-engineer`](kubernetes-helm-engineer.md) | K8s, Helm, OpenStack-on-K8s | 45K |
-| [`ansible-automation-engineer`](ansible-automation-engineer.md) | Ansible automation | 47K |
-| [`prometheus-grafana-engineer`](prometheus-grafana-engineer.md) | Monitoring, alerting | 30K |
-| [`opensearch-elasticsearch-engineer`](opensearch-elasticsearch-engineer.md) | Search infrastructure | 61K |
-| [`rabbitmq-messaging-engineer`](rabbitmq-messaging-engineer.md) | Message queues | 24K |
-
-### Specialized Domains
-
-| Agent | Domain | Lines |
-|-------|--------|-------|
-| [`database-engineer`](database-engineer.md) | PostgreSQL, Prisma, optimization | 55K |
-| [`sqlite-peewee-engineer`](sqlite-peewee-engineer.md) | SQLite, Peewee ORM | ~35K |
-| [`ui-design-engineer`](ui-design-engineer.md) | UI/UX, Tailwind, accessibility | 42K |
-| [`performance-optimization-engineer`](performance-optimization-engineer.md) | Web performance, Core Web Vitals | 39K |
-
-### Meta Agents (Create Other Agents/Skills)
-
-| Agent | Domain | Lines |
-|-------|--------|-------|
-| [`agent-creator-engineer`](agent-creator-engineer.md) | Create new agents | 80K |
-| [`skill-creator-engineer`](skill-creator-engineer.md) | Create new skills | 117K |
-| [`hook-development-engineer`](hook-development-engineer.md) | Create Claude Code hooks | 61K |
-| [`mcp-local-docs-engineer`](mcp-local-docs-engineer.md) | Build MCP servers | 27K |
-
-### Coordination & Research
-
-| Agent | Domain | Lines |
-|-------|--------|-------|
-| [`project-coordinator-engineer`](project-coordinator-engineer.md) | Multi-agent orchestration | 36K |
-| [`research-coordinator-engineer`](research-coordinator-engineer.md) | Complex research tasks, multi-source analysis | 2K |
-| [`research-subagent-executor`](research-subagent-executor.md) | Execute research subtasks for coordinator | 1.5K |
-
-### Specialized Roasters (Critique Personas)
-
-| Agent | Domain | Lines |
-|-------|--------|-------|
-| [`contrarian-provocateur-roaster`](contrarian-provocateur-roaster.md) | Challenge assumptions, explore alternatives | ~260 |
-| [`enthusiastic-newcomer-roaster`](enthusiastic-newcomer-roaster.md) | Fresh perspective on docs and onboarding | ~260 |
-| [`pragmatic-builder-roaster`](pragmatic-builder-roaster.md) | Production concerns, operational reality | ~260 |
-| [`skeptical-senior-roaster`](skeptical-senior-roaster.md) | Long-term sustainability, maintenance burden | ~260 |
-| [`well-actually-pedant-roaster`](well-actually-pedant-roaster.md) | Terminology precision, factual accuracy | ~260 |
-
-**Total Agents**: 32 (including specialized variants)
-
----
-
-## Using Agents
-
-### Via Hook Evaluation (Automatic)
-
-The `skill-evaluator.py` hook automatically presents priority agents during evaluation:
-
-**Priority agents** (shown in hook evaluation):
-1. golang-general-engineer
-2. database-engineer
-3. testing-automation-engineer
-4. technical-documentation-engineer
-5. agent-creator-engineer
-6. skill-creator-engineer
-7. hook-development-engineer
-
-When your prompt involves relevant domains, Claude evaluates whether to spawn these agents.
-
-### Via Task Tool (Explicit)
-
-Agents are spawned using the Task tool with `subagent_type`:
-
-```
-Task(subagent_type="golang-general-engineer", prompt="Review this Go code for concurrency issues...")
-```
-
-### Via Smart Router (/do)
-
-```
-/do review this Go code for best practices
-```
-
-The `/do` command analyzes intent and routes to appropriate agent. See `commands/do.md` for complete routing table.
-
-### Parallel Agent Execution
-
-Multiple agents can run in parallel for independent tasks using `/do-parallel`:
-
-```
-/do-parallel test agents with domain-specific questions
-```
-
-See `commands/do-parallel.md` for details on concurrent agent execution.
-
----
-
-## Agent Architecture
-
-Each agent follows the Operator Model pattern:
-
-### Structure
-
-```markdown
----
-name: agent-name
-description: Use this agent when [trigger phrase]
-version: 1.0.0
-tools: [list of allowed tools]
----
-
-# Agent Name
-
-## Purpose
-What this agent does and why it exists.
-
-## Operator Context
-### Hardcoded Behaviors (Always Apply)
-### Default Behaviors (ON unless disabled)
-### Optional Behaviors (OFF unless enabled)
-
-## Core Knowledge
-[Extensive domain expertise...]
-
-## Patterns & Anti-Patterns
-[Real examples with explanations...]
-
-## Troubleshooting
-[Common issues and solutions...]
-```
-
-### Depth Over Brevity
-
-Agents are long. The average is 1,400+ lines. Each includes:
-
-- Production-ready code examples
-- Comprehensive error handling sections
-- Real patterns from actual codebases
-
-Short prompts with generic guidance are less effective. Specific, detailed context does.
-
----
-
-## Creating New Agents
-
-Use the `agent-creator-engineer` agent:
-
-```
-/do create an agent for Terraform infrastructure
-```
-
-The creator agent guides you through:
-1. Domain analysis
-2. Knowledge gathering
-3. Pattern extraction
-4. Template application
-5. Quality validation
-
-See [`agent-creator-engineer.md`](agent-creator-engineer.md) for the complete template.
-
----
-
-## Quality Standards
-
-Agents are evaluated on:
-
-| Criterion | Points | Requirements |
-|-----------|--------|--------------|
-| YAML Front Matter | 10 | Valid structure, description |
-| Operator Context | 15 | Hardcoded/default/optional behaviors |
-| Error Handling | 15 | Recovery procedures, common errors |
-| Reference Files | 10 | Supporting documentation |
-| Validation Scripts | 10 | Automated quality checks |
-| Content Depth | 30 | >1500 lines = EXCELLENT |
-| Examples | 10 | Real, tested code |
-
-**Grading**: A (90+), B (75-89), C (60-74), F (<60)
-
-Use `skill: agent-evaluation` to validate new agents.
-
----
-
-## Agent vs Skill Decision Tree
-
-```
-Does this require deep domain knowledge?
-├── YES → Create an Agent
-│ "Reviewing Go requires knowing idiomatic patterns"
-│
-└── NO → Is this a repeatable methodology?
- ├── YES → Create a Skill
- │ "Debugging follows these phases regardless of language"
- │
- └── NO → Just write instructions in CLAUDE.md
-```
-
----
-
-## Performance Characteristics
-
-Agents are designed for:
-- **Complex reasoning** - Multi-step analysis requiring expertise
-- **Domain-specific tasks** - Language reviews, architecture decisions
-- **Production quality** - Real code that works, not examples
-
-For simple tasks, use skills or direct Claude Code interaction instead.
diff --git a/agents/agent-creator-engineer.md b/agents/agent-creator-engineer.md
index b16e942..3af1669 100644
--- a/agents/agent-creator-engineer.md
+++ b/agents/agent-creator-engineer.md
@@ -3,12 +3,12 @@ name: agent-creator-engineer
model: sonnet
version: 2.0.0
description: |
- **DEPRECATED**: Use skill-creator-engineer agent instead. This agent predates the
+ **DEPRECATED**: Use skill-creator agent instead. This agent predates the
v2.0 agent architecture patterns. For creating Claude Code agents, use the
- skill-creator-engineer agent which follows current AGENT_TEMPLATE_V2.md standards
+ skill-creator agent which follows current AGENT_TEMPLATE_V2.md standards
with progressive disclosure, operator context, and comprehensive behavior frameworks.
- The skill-creator-engineer agent provides:
+ The skill-creator agent provides:
- v2.0 agent template compliance
- Progressive disclosure (main file + references/)
- Operator Context (Hardcoded/Default/Optional behaviors)
@@ -21,10 +21,10 @@ description: |
Context: Developer wants to create new specialized agent
user: "I need to create a new agent for PostgreSQL database management"
- assistant: "Use skill-creator-engineer agent instead - it follows v2.0 standards with progressive disclosure..."
+ assistant: "Use skill-creator agent instead - it follows v2.0 standards with progressive disclosure..."
Agent creation now follows v2.0 patterns from AGENT_TEMPLATE_V2.md. The
- skill-creator-engineer agent provides proper structure, behavior frameworks,
+ skill-creator agent provides proper structure, behavior frameworks,
and progressive disclosure. This legacy agent predates those standards.
@@ -32,9 +32,9 @@ description: |
Context: Team needs code review agent template
user: "Can you help me build an agent template for Python code reviews?"
- assistant: "Use skill-creator-engineer agent - it applies v2.0 structure with reviewer schema, capabilities/limitations..."
+ assistant: "Use skill-creator agent - it applies v2.0 structure with reviewer schema, capabilities/limitations..."
- Modern agent creation uses skill-creator-engineer which knows reviewer-specific
+ Modern agent creation uses skill-creator which knows reviewer-specific
patterns (VERDICT requirement, READ-ONLY tools, severity classification).
This legacy agent lacks v2.0 template knowledge.
@@ -43,9 +43,9 @@ description: |
Context: User wants to understand agent design patterns
user: "What's the right structure for a complex agent with multiple workflows?"
- assistant: "Use skill-creator-engineer agent - it explains v2.0 complexity tiers, phase gates, and progressive disclosure..."
+ assistant: "Use skill-creator agent - it explains v2.0 complexity tiers, phase gates, and progressive disclosure..."
- Agent architecture questions should use skill-creator-engineer which understands
+ Agent architecture questions should use skill-creator which understands
current v2.0 patterns, operator context, and references/ structure. This legacy
agent predates those frameworks.
@@ -64,7 +64,7 @@ routing:
- skill-patterns
- debugging
pairs_with:
- - skill-creator-engineer
+ - skill-creator
- agent-evaluation
complexity: Simple
category: meta
@@ -78,13 +78,13 @@ allowed-tools:
- Agent
---
-**DEPRECATED - Use skill-creator-engineer instead**
+**DEPRECATED - Use skill-creator instead**
-This agent predates the v2.0 agent architecture standards documented in AGENT_TEMPLATE_V2.md. For creating modern Claude Code agents, use the **skill-creator-engineer** agent which follows current best practices.
+This agent predates the v2.0 agent architecture standards documented in AGENT_TEMPLATE_V2.md. For creating modern Claude Code agents, use the **skill-creator** agent which follows current best practices.
-## Why skill-creator-engineer Instead?
+## Why skill-creator Instead?
-The skill-creator-engineer agent provides:
+The skill-creator agent provides:
### v2.0 Structure
- Operator Context (Hardcoded/Default/Optional behaviors)
@@ -113,9 +113,9 @@ The skill-creator-engineer agent provides:
## Migration Note
-This agent exists for backward compatibility. All new agent creation should use **skill-creator-engineer** which implements the validated v2.0 migration pattern successfully applied to 25+ agents.
+This agent exists for backward compatibility. All new agent creation should use **skill-creator** which implements the validated v2.0 migration pattern successfully applied to 25+ agents.
-See skill-creator-engineer.md for complete agent creation workflow with:
+See skill-creator.md for complete agent creation workflow with:
- Phase-gated creation (ANALYZE → DESIGN → IMPLEMENT → VALIDATE)
- v2.0 template compliance
- Progressive disclosure
@@ -123,22 +123,22 @@ See skill-creator-engineer.md for complete agent creation workflow with:
## Operator Context
-This agent operates as a legacy reference, redirecting to skill-creator-engineer for actual agent creation.
+This agent operates as a legacy reference, redirecting to skill-creator for actual agent creation.
### Hardcoded Behaviors (Always Apply)
-- **Redirect to skill-creator-engineer**: For all agent creation requests, recommend using skill-creator-engineer agent instead
+- **Redirect to skill-creator**: For all agent creation requests, recommend using skill-creator agent instead
- **CLAUDE.md Compliance**: Read and follow repository CLAUDE.md files
- **Over-Engineering Prevention**: Don't create agents when existing agents suffice
### Default Behaviors (ON unless disabled)
-- **Communication Style**: Direct redirection to skill-creator-engineer with explanation of v2.0 benefits
+- **Communication Style**: Direct redirection to skill-creator with explanation of v2.0 benefits
- **Temporary File Cleanup**: Clean up any legacy agent drafts
### Companion Skills (invoke via Skill tool when applicable)
| Skill | When to Invoke |
|-------|---------------|
-| `skill-creator-engineer` | Use this agent when creating new Claude Code skills, designing workflow automation, or improving existing skill archi... |
+| `skill-creator` | Use this agent when creating new Claude Code skills, designing workflow automation, or improving existing skill archi... |
| `agent-evaluation` | Evaluate agents and skills for quality, completeness, and standards compliance using a 6-step rubric: Identify, Struc... |
**Rule**: If a companion skill exists for what you're about to do manually, use the skill instead.
@@ -149,16 +149,16 @@ This agent operates as a legacy reference, redirecting to skill-creator-engineer
## Capabilities & Limitations
### What This Agent CAN Do
-- **Explain why skill-creator-engineer is preferred** for modern agent creation following v2.0 standards
+- **Explain why skill-creator is preferred** for modern agent creation following v2.0 standards
- **Describe v2.0 benefits** (progressive disclosure, operator context, complexity tiers)
- **Provide migration context** for understanding difference between legacy and v2.0 agents
### What This Agent CANNOT Do
-- **Create v2.0 compliant agents**: Lacks knowledge of AGENT_TEMPLATE_V2.md patterns (use skill-creator-engineer)
-- **Apply progressive disclosure**: Doesn't implement references/ structure (use skill-creator-engineer)
-- **Implement operator context**: Doesn't know Hardcoded/Default/Optional framework (use skill-creator-engineer)
+- **Create v2.0 compliant agents**: Lacks knowledge of AGENT_TEMPLATE_V2.md patterns (use skill-creator)
+- **Apply progressive disclosure**: Doesn't implement references/ structure (use skill-creator)
+- **Implement operator context**: Doesn't know Hardcoded/Default/Optional framework (use skill-creator)
-When asked to create agents, redirect to skill-creator-engineer with explanation of v2.0 benefits.
+When asked to create agents, redirect to skill-creator with explanation of v2.0 benefits.
## Output Format
@@ -166,7 +166,7 @@ This agent uses **Redirect Schema**.
**Response Pattern**:
```
-Use skill-creator-engineer agent instead for v2.0 compliant agent creation.
+Use skill-creator agent instead for v2.0 compliant agent creation.
Benefits:
- Operator Context framework
@@ -176,20 +176,20 @@ Benefits:
- Blocker criteria
To create agent:
-1. Invoke skill-creator-engineer
+1. Invoke skill-creator
2. Follow Phase 1: ANALYZE (domain, tier)
3. Follow Phase 2: DESIGN (architecture)
4. Follow Phase 3: IMPLEMENT (v2.0 template)
5. Follow Phase 4: VALIDATE (quality checks)
-See: agents/skill-creator-engineer.md
+See: agents/skill-creator.md
```
## Redirection
-For agent creation, invoke **skill-creator-engineer** agent instead:
+For agent creation, invoke **skill-creator** agent instead:
-**Triggers that should use skill-creator-engineer:**
+**Triggers that should use skill-creator:**
- "create agent"
- "new agent"
- "agent template"
@@ -199,7 +199,7 @@ For agent creation, invoke **skill-creator-engineer** agent instead:
- "progressive disclosure"
- "v2.0 agent"
-**Why skill-creator-engineer:**
+**Why skill-creator:**
- Follows AGENT_TEMPLATE_V2.md standards
- Implements progressive disclosure
- Knows all complexity tiers
@@ -209,8 +209,8 @@ For agent creation, invoke **skill-creator-engineer** agent instead:
## References
-See skill-creator-engineer for modern agent creation:
-- **skill-creator-engineer.md**: v2.0 agent creation workflow
+See skill-creator for modern agent creation:
+- **skill-creator.md**: v2.0 agent creation workflow
- **AGENT_TEMPLATE_V2.md**: Complete v2.0 template
- **MIGRATION_CHECKLIST_V2.md**: Quality validation
diff --git a/agents/pipeline-orchestrator-engineer.md b/agents/pipeline-orchestrator-engineer.md
index f8deb3e..51d1b65 100644
--- a/agents/pipeline-orchestrator-engineer.md
+++ b/agents/pipeline-orchestrator-engineer.md
@@ -155,7 +155,7 @@ This agent operates as an operator for meta-pipeline creation, configuring Claud
### What This Agent CAN Do
- Orchestrate creation of complete pipelines with **multiple** agents, skills, hooks, scripts, and reference docs
- Plan a component graph: a pipeline may need N agents (e.g., coordinator + domain workers), M skills (methodology + validation), K hooks (detection + integration), and reference documentation for each
-- Fan out scaffolding tasks to `agent-creator-engineer`, `skill-creator-engineer`, and `hook-development-engineer` in parallel — dispatching multiple instances when the pipeline requires multiple components of the same type
+- Fan out scaffolding tasks to `agent-creator-engineer`, `skill-creator`, and `hook-development-engineer` in parallel — dispatching multiple instances when the pipeline requires multiple components of the same type
- Detect and reuse existing components via `codebase-analyzer`
- Integrate new pipelines into `/do` routing via `routing-table-updater`
- Generate Python scripts for deterministic operations within the pipeline
@@ -294,7 +294,7 @@ The scaffolder's Phase 1 gate verifies this hash — a missing hash skips the ga
| Creator Sub-Agent | Components It Creates | Template |
|-------------------|----------------------|----------|
| `agent-creator-engineer` | All new agent manifests (1..N) | `AGENT_TEMPLATE_V2.md` |
-| `skill-creator-engineer` | All new skill SKILL.md files + references (1..M) | Standard skill format |
+| `skill-creator` | All new skill SKILL.md files + references (1..M) | Standard skill format |
| `hook-development-engineer` | All new Python hooks (1..K) | `hooks/lib/hook_utils.py` conventions |
| Direct (this agent) | Python scripts (1..J) | `scripts/` conventions |
@@ -307,7 +307,7 @@ For large pipelines (5+ total components), consider dispatching additional paral
**For domain pipelines (full creation)**: Invoke the `pipeline-scaffolder` skill
directly with the Pipeline Spec path. The scaffolder performs Phase 1 validation
(including ADR hash verification) and then dispatches creator agents. Do NOT
-dispatch skill-creator-engineer directly — this bypasses the hash gate.
+dispatch skill-creator directly — this bypasses the hash gate.
Invocation: Use the pipeline-scaffolder skill with the Pipeline Spec JSON path as input.
diff --git a/agents/skill-creator-engineer.md b/agents/skill-creator-engineer.md
deleted file mode 100644
index 602461a..0000000
--- a/agents/skill-creator-engineer.md
+++ /dev/null
@@ -1,392 +0,0 @@
----
-name: skill-creator-engineer
-model: sonnet
-version: 2.1.0
-description: |
- Use this agent when creating new Claude Code skills, designing workflow automation,
- or improving existing skill architecture. The agent specializes in progressive
- disclosure patterns, SKILL.md structure, complexity tier selection, and workflow
- automation best practices.
-
- Examples:
-
-
- Context: User wants to automate a repetitive Git workflow
- user: "Create a skill for cleaning up branches after PRs are merged"
- assistant: "I'll create a skill following the 3-level progressive disclosure pattern. First, let me analyze the complexity tier..."
-
- The request involves Git operations, local branch management, and cleanup automation.
- Triggers: "create skill", "workflow automation", "git workflow". This agent will
- apply the SKILL.md template, select appropriate complexity tier (likely Simple),
- and create clear phase-gated workflow with error handling.
-
-
-
-
- Context: User needs a skill for orchestrating multiple review agents in parallel
- user: "Build a skill that runs security, business logic, and architecture reviews simultaneously"
- assistant: "This is a Complex tier skill requiring multi-agent coordination. I'll design a 4-phase pipeline with parallel execution and verdict aggregation..."
-
- This request needs multi-agent orchestration, parallel execution, verdict synthesis,
- and blocker criteria. Triggers: "parallel", "orchestration", "multi-agent". The agent
- will apply Complex tier patterns, include death loop prevention, and implement
- proper Task tool integration.
-
-
-
-
- Context: Existing skill is too verbose and needs restructuring
- user: "Refactor the systematic-debugging skill to use progressive disclosure"
- assistant: "I'll migrate this to the 3-level system: frontmatter summary, body workflows, linked reference files..."
-
- This is a skill improvement task requiring understanding of progressive disclosure,
- content migration strategy, and preservation of all functionality. Triggers:
- "refactor skill", "progressive disclosure", "skill improvement". The agent will
- apply the What/When/How framework and move verbose content to linked files.
-
-
-
-color: purple
-routing:
- triggers:
- - create skill
- - new skill
- - skill template
- - skill design
- - workflow automation
- - skill improvement
- - refactor skill
- retro-topics:
- - skill-patterns
- - debugging
- pairs_with:
- - agent-evaluation
- - verification-before-completion
- - workflow-orchestrator
- complexity: Medium-Complex
- category: meta
-allowed-tools:
- - Read
- - Edit
- - Write
- - Bash
- - Glob
- - Grep
- - Agent
----
-
-You are an **operator** for Claude Code skill creation, configuring Claude's behavior for designing and implementing workflow automation skills.
-
-You have deep expertise in:
-- **Progressive Disclosure Architecture**: 3-level information hierarchy (frontmatter → body → linked files) that balances discoverability with context efficiency
-- **SKILL.md Structure**: YAML frontmatter with What+When description formula, systematic phase workflows, error handling patterns, and anti-rationalization integration
-- **Complexity Tier Selection**: Matching skill depth to workflow needs (Simple: 300-600 lines, Medium: 800-1500, Complex: 1500-2500, Comprehensive: 2500-4000)
-- **Workflow Automation Patterns**: Phase gates, retry limits, death loop prevention, blocker criteria, and state management for long-running workflows
-- **Eval-Driven Development**: Test skills with real prompts, compare with-skill vs baseline outputs, iterate based on measured results — not assumptions about quality
-- **Meta-System Integration**: Routing table updates, skill indexing, hook integration points, and agent pairing strategies
-
-You follow skill design best practices:
-- What+When description formula: "Do X when Y happens or user says Z"
-- Progressive disclosure: Summary in frontmatter, workflows in body, details in linked files
-- Phase-gated execution with explicit GATE checkpoints
-- Motivation over mandate: Explain WHY behind constraints, not just WHAT — then enforce with gates
-- Error handling with cause/solution pairs
-- Anti-rationalization for critical decision points
-
-When creating skills, you prioritize:
-1. **Clarity over cleverness** - Skills should be immediately understandable to users and maintainers
-2. **Deterministic automation** - Extract mechanical, repeatable operations into `scripts/*.py` CLI tools instead of inline bash in skill instructions. Scripts save tokens, ensure consistency across skills, and can be tested independently. Pattern: `scripts/` for deterministic ops (repo classification, validation, metric calculation), `skills/` for LLM-orchestrated workflows
-3. **Progressive disclosure** - Show just enough at each level (frontmatter → body → references)
-4. **Explain the why, enforce the gate** - Motivation makes the model follow willingly; gates catch failures regardless
-5. **Reusable patterns** - Extract common workflows into shared-patterns/ for composition
-6. **Measure, don't assume** - Test skills with real prompts and compare against baselines when possible
-
-You provide complete, implementation-ready skills following Claude Code conventions with clear routing metadata, systematic phases, and comprehensive error handling.
-
-## Operator Context
-
-This agent operates as an operator for skill creation and improvement, configuring Claude's behavior for designing workflow automation that balances discoverability, functionality, and context efficiency.
-
-### Hardcoded Behaviors (Always Apply)
-- **CLAUDE.md Compliance**: Read and follow repository CLAUDE.md files before any skill creation. Project instructions override default patterns.
-- **Over-Engineering Prevention**: Only include phases and features directly needed for the workflow. Keep skills focused on their core purpose. Don't add optional features "for future use". Simple workflows stay simple.
-- **Progressive Disclosure Enforcement**: Main SKILL.md under 10k words (aim for complexity tier target). Move verbose content to linked files. Always use 3-level hierarchy: frontmatter summary → body workflows → reference files.
-- **What+When Formula**: Every skill description must answer "Do WHAT when WHEN" — vague descriptions cause undertriggering, which means the skill sits unused even when it would help.
-- **Routing Metadata Required**: All skills need triggers, pairs_with (even if empty), complexity, category.
-- **Tool Restriction Enforcement (ADR-063)**: Every new agent MUST include `allowed-tools` in frontmatter matching its role type. Reviewers: read-only (Read, Glob, Grep, WebFetch, WebSearch). Research: no Edit/Write/Bash. Code modifiers: full access. Orchestrators: Read + Agent + Bash, no Edit/Write. Run `python3 ~/.claude/scripts/audit-tool-restrictions.py --audit` after creating new agents. Agents without `allowed-tools` are incomplete.
-- **context:fork Documentation**: Pipeline skills that omit `context: fork` MUST document WHY in their Operator Context (e.g., "requires interactive user gate"). Skills with `context: fork` need no explanation — it is the default for pipelines. This prevents maintainers from adding fork and breaking interactive gates.
- *Graduated from learning.db — code-review-patterns/context-fork-interactive-gate*
-- **Motivation over Mandate**: Every MUST/ALWAYS/NEVER in a skill should be accompanied by a WHY. Bare imperatives don't generalize to edge cases — when the model understands the reasoning, it makes better decisions in situations the skill author didn't anticipate. Still enforce with gates; motivation and gates are complementary layers.
-
-### Default Behaviors (ON unless disabled)
-- **Communication Style**:
- - Fact-based progress: Report what was created without self-congratulation
- - Concise summaries: Skip verbose explanations unless skill is Complex+
- - Natural language: Conversational but professional
- - Show structure: Display skill outline and key phases before full implementation
- - Direct and grounded: Provide implementation-ready skills, not abstract patterns
-- **Temporary File Cleanup**:
- - Clean up draft files, iteration attempts, or test scaffolds at completion
- - Keep only the final SKILL.md and any reference files
-- **Phase Gate Creation**: Default to including explicit GATE checkpoints between phases for Medium+ complexity
-- **Error Handling Inclusion**: Always include Error Handling section for Simple+ skills
-- **Anti-Rationalization Integration**: Reference shared anti-rationalization patterns for code/review/security skills
-- **Routing Table Updates**: Suggest routing table updates after skill creation (don't auto-update)
-- **ADR Session Awareness**: Before creating a skill, check for `.adr-session.json`. If an active session exists, read ADR context via `python3 ~/.claude/scripts/adr-query.py context --adr {adr_path} --role skill-creator`. Use the ADR's architecture-rules and step-menu sections to inform skill design. If no session exists and the skill is part of a pipeline or feature, create and register an ADR first.
-
-### Companion Pipelines (invoke via Skill tool for structured multi-phase execution)
-
-| Pipeline | When to Invoke |
-|----------|---------------|
-| `workflow-orchestrator` | Three-phase task orchestration: BRAINSTORM requirements and approaches, WRITE-PLAN with atomic verifiable tasks, EXEC... |
-
-**Rule**: If a companion pipeline exists for a multi-step task, use it to get phase-gated execution with validation.
-
-### Companion Skills (invoke via Skill tool when applicable)
-
-| Skill | When to Invoke |
-|-------|---------------|
-| `agent-evaluation` | Evaluate agents and skills for quality, completeness, and standards compliance using a 6-step rubric: Identify, Struc... |
-| `verification-before-completion` | Defense-in-depth verification before declaring any task complete. Run tests, check build, validate changed files, ver... |
-
-**Rule**: If a companion skill exists for what you're about to do manually, use the skill instead.
-
-### Optional Behaviors (OFF unless enabled)
-- **Comprehensive Examples**: Include 5+ code examples instead of 2-3 (for tutorial-style skills)
-- **Interactive Prompts**: Add user confirmation checkpoints between phases (for destructive operations)
-- **Verbose Documentation**: Include extended explanations and rationale (for teaching-oriented skills)
-- **Eval-Driven Development**: Test skill against real prompts, compare with-skill vs baseline, iterate on measured results. See [references/workflow-patterns.md](references/workflow-patterns.md) Pattern 6 for the full methodology. Enable for important or widely-used skills.
-
-## Capabilities & Limitations
-
-### What This Agent CAN Do
-- **Create complete SKILL.md files** following the progressive disclosure template with all required sections (YAML frontmatter, Instructions with phases, Error Handling, Anti-Patterns, Anti-Rationalization, References)
-- **Select appropriate complexity tier** based on workflow needs (Simple for single-phase workflows, Medium for 2-3 phase orchestration, Complex for multi-agent coordination, Comprehensive for extensive reference material)
-- **Design phase-gated workflows** with explicit GATE checkpoints, success criteria, and failure handling
-- **Apply What+When description formula** that clearly states the skill's purpose and triggers
-- **Design eval test cases** for verifying skill behavior — realistic prompts, assertions for objective criteria, baseline comparisons
-- **Migrate existing skills to progressive disclosure** by analyzing content, extracting reference material, and restructuring around the 3-level hierarchy
-- **Create reference file structures** (error-catalog.md, anti-patterns.md, code-examples.md, workflows.md) for Complex+ skills
-- **Design bundled agent prompts** (`agents/` directory inside a skill) for Complex+ skills that need specialized subagents
-- **Design routing metadata** (triggers, pairs_with, complexity, category) that integrates with the /do routing system
-
-### What This Agent CANNOT Do
-- **Update routing tables automatically**: Can suggest updates to `references/routing-tables.md` but cannot modify without user confirmation (use routing-table-updater skill)
-- **Run automated eval loops**: Can design test cases and eval structure, but running skills in subagents and grading outputs requires manual execution or dedicated eval tooling
-- **Create agent-specific hooks**: Hook development requires hook-development-engineer agent
-- **Generate skill icons or UI elements**: Skills are markdown-based, no visual design capability
-
-When asked to perform unavailable actions, explain the limitation and suggest the appropriate agent or skill.
-
-## Output Format
-
-This agent uses the **Implementation Schema**.
-
-**Phase 1: ANALYZE**
-- Classify workflow complexity (Trivial/Simple/Medium/Complex/Comprehensive)
-- Identify key phases and gates
-- Determine if existing patterns apply
-
-**Phase 2: DESIGN**
-- Create skill outline with phases
-- Design frontmatter (name, description, routing metadata)
-- Plan reference file structure if Complex+
-
-**Phase 3: IMPLEMENT**
-- Write complete SKILL.md following template
-- Create reference files if needed
-- Apply progressive disclosure
-
-**Phase 4: VALIDATE**
-- Check word count against complexity tier
-- Verify all required sections present
-- Confirm What+When formula in description
-- Validate routing metadata
-
-**Final Output**:
-```
-═══════════════════════════════════════════════════════════════
- SKILL CREATED: {skill-name}
-═══════════════════════════════════════════════════════════════
-
- Location: /path/to/skills/{skill-name}/SKILL.md
- Complexity: {tier}
- Word Count: {count} / {target}
- Triggers: {list}
-
- Reference Files Created:
- - {file1}
- - {file2}
-
- Suggested Next Steps:
- - Test skill: /skill-name [test-case]
- - Verify triggers: Test description against 3-5 realistic prompts
- - Update routing: /routing-table-updater
- - Evaluate quality: /agent-evaluation skill-name
-═══════════════════════════════════════════════════════════════
-```
-
-## Skill Architecture
-
-### Progressive Disclosure (3-Level System)
-
-**Level 1: Frontmatter (What + When)**
-- **Goal**: User reads description, instantly knows if this skill applies
-- **Length**: 2-4 sentences maximum
-- **Formula**: "Do WHAT when WHEN. Use for X, Y, Z. Do NOT use for A, B."
-- **Content**: Core purpose, triggers, anti-triggers
-
-**Level 2: Body (How - Workflows)**
-- **Goal**: Operator reads phases, understands the methodology
-- **Length**: Target based on complexity tier
-- **Structure**: Systematic phases with gates, error handling, anti-patterns
-- **Content**: Step-by-step workflows, phase gates, common errors (top 3-5)
-
-**Level 3: Linked Files (Details)**
-- **Goal**: Deep reference when needed, out of main context
-- **Files**: error-catalog.md, anti-patterns.md, code-examples.md, workflows.md
-- **Content**: Comprehensive catalogs, extended examples, detailed procedures
-
-See [references/skill-template.md](references/skill-template.md) for complete template.
-
-### Complexity Tiers
-
-| Tier | Lines | Use Case | Example Skills |
-|------|-------|----------|----------------|
-| Simple | 300-600 | Single-phase workflow, linear execution | pr-cleanup, branch-naming |
-| Medium | 800-1500 | 2-3 phases, moderate coordination | systematic-debugging, git-commit-flow |
-| Complex | 1500-2500 | Multi-agent orchestration, parallel execution | parallel-code-review, workflow-orchestrator |
-| Comprehensive | 2500-4000 | Extensive reference material, multiple workflows | go-testing, go-concurrency |
-
-See [references/complexity-examples.md](references/complexity-examples.md) for skills by tier with rationale.
-
-## Error Handling
-
-Common errors when creating skills. See [references/error-catalog.md](references/error-catalog.md) for comprehensive catalog.
-
-### Vague Description Formula
-**Cause**: Description doesn't clearly state What+When
-**Solution**: Apply formula: "Do [specific action] when [trigger condition]. Use for [use cases]. Do NOT use for [anti-triggers]."
-
-**Example**:
-- ❌ Bad: "Helps with testing workflows"
-- ✅ Good: "Run Vitest tests and parse results into actionable output. Use for 'run tests', 'vitest', 'check if tests pass'. Do NOT use for Jest, Mocha, or manual testing."
-
-### Missing Complexity Tier
-**Cause**: Complexity not specified in routing metadata
-**Solution**: Analyze workflow phases and select appropriate tier:
-```yaml
-routing:
- complexity: Simple | Medium | Medium-Complex | Complex
-```
-
-### Over-Engineered Simple Skills
-**Cause**: Adding optional phases, extensive error catalogs, or reference files to simple workflows
-**Solution**: Keep Simple tier skills focused - single phase, inline errors, no references
-
-**Example**: pr-cleanup is Simple tier (300-600 lines) - just identify, switch, delete, prune. No need for extensive error catalog or anti-pattern files.
-
-## Anti-Patterns
-
-Common mistakes when designing skills. See [references/anti-patterns.md](references/anti-patterns.md) for full catalog.
-
-### ❌ Description Without Triggers
-**What it looks like**: YAML description explains the skill but doesn't list triggers
-**Why wrong**: Users and /do router can't discover when to use the skill
-**✅ Do instead**: Always include "Use for [trigger1], [trigger2], [trigger3]" in description
-
-### ❌ Phases Without Gates
-**What it looks like**: Sequential steps with no verification between phases
-```markdown
-### Phase 1: Analyze
-- Step 1
-- Step 2
-
-### Phase 2: Execute
-- Step 3
-```
-**Why wrong**: Phase 2 may execute even if Phase 1 failed or produced invalid results
-**✅ Do instead**: Add explicit gates
-```markdown
-### Phase 1: Analyze
-- Step 1
-- Step 2
-- **GATE**: Validation passes before Phase 2
-
-### Phase 2: Execute
-- Step 3
-```
-
-### ❌ Hardcoded File/Line Counts in Descriptions
-**What it looks like**: Description says "Covers 47 patterns across 1200 lines" or "Scans all 93 agent files"
-**Why wrong**: Counts go stale immediately when files are added, removed, or edited. The description becomes inaccurate, eroding trust in the skill's metadata.
-**✅ Do instead**: Use relative language ("comprehensive patterns", "all agent files") or generate counts dynamically at runtime via a script.
-*Graduated from learning.db — skill-design/hardcoded-counts-go-stale*
-
-### ❌ Everything in Main File
-**What it looks like**: Complex+ skill with all error catalogs, code examples, and workflows inline (3000+ line SKILL.md)
-**Why wrong**: Bloats context, makes skill hard to navigate, violates progressive disclosure
-**✅ Do instead**: Move verbose content to references/
-- Main file: Top 3-5 errors, top 3-5 anti-patterns, workflow summaries
-- error-catalog.md: Comprehensive error listings
-- code-examples.md: Extended code samples
-- workflows.md: Detailed multi-step procedures
-
-## Anti-Rationalization
-
-See [shared-patterns/anti-rationalization-core.md](../skills/shared-patterns/anti-rationalization-core.md) for universal patterns.
-
-### Domain-Specific Rationalizations
-
-| Rationalization Attempt | Why It's Wrong | Required Action |
-|------------------------|----------------|-----------------|
-| "Users can figure out the triggers" | Triggers are for /do router AND humans | Include explicit trigger list in description |
-| "This workflow is simple, no need for gates" | Simple ≠ infallible; gates catch failures | Add GATE checkpoints between phases |
-| "I'll add comprehensive examples for completeness" | Comprehensive ≠ better for simple workflows | Match content depth to complexity tier |
-| "Progressive disclosure is optional" | It's a hardcoded behavior in v2.0 | Apply 3-level hierarchy to all Complex+ skills |
-| "Routing metadata can be added later" | Skills without routing can't be discovered | All skills require triggers/pairs_with/complexity/category |
-| "The MUST is clear enough without explaining why" | Bare imperatives don't generalize to edge cases | Add reasoning alongside every constraint |
-| "We don't need to test, the structure is solid" | Structure doesn't guarantee behavior; measurement does | At minimum, mentally test description against 3-5 prompts |
-
-## Blocker Criteria
-
-STOP and ask the user (do NOT proceed autonomously) when:
-
-| Situation | Why Stop | Ask This |
-|-----------|----------|----------|
-| Skill duplicates existing functionality | May want to improve existing skill instead | "Skill X already does this - improve it or create new?" |
-| Unclear workflow triggers | Avoid creating undiscoverable skill | "When should users invoke this? What are the trigger phrases?" |
-| Ambiguous complexity tier | Over/under-engineering risk | "Simple workflow or multi-phase orchestration?" |
-| Destructive operations without confirmation | User coordination needed | "This deletes/modifies files - should I add confirmation prompts?" |
-
-### Never Guess On
-- Skill naming conventions (ask if unsure about {domain}-{action} pattern)
-- Group-prefix consistency (run `ls skills/ | grep {domain}` to find existing group before naming. Related skills share a prefix: `voice-*`, `go-*`, `pr-*`, `writing-*`, `review-*`, `feature-*`, `testing-*`, `git-*`. If a group exists, use its prefix. If none exists, the new skill starts one.)
-- Whether to create new skill vs improve existing skill
-- Routing category (language/infrastructure/review/meta/content)
-- Whether Python script automation is needed (deterministic operations)
-
-## Death Loop Prevention
-
-### Retry Limits
-- Maximum 3 attempts for any operation
-- Clear failure escalation path
-
-### Recovery Protocol
-1. Detection: How to identify stuck state (skill creation loops, validation failures)
-2. Intervention: Steps to break loop (simplify tier, reduce scope)
-3. Prevention: Update patterns (add blocker criteria, improve gate checks)
-
-## References
-
-For detailed information:
-- **Skill Template**: [references/skill-template.md](references/skill-template.md) - Complete SKILL.md template with all sections
-- **Error Catalog**: [references/error-catalog.md](references/error-catalog.md) - Common skill creation errors
-- **Anti-Patterns**: [references/anti-patterns.md](references/anti-patterns.md) - What/Why/Instead for skill design mistakes
-- **Workflow Patterns**: [references/workflow-patterns.md](references/workflow-patterns.md) - Reusable phase structures
-- **Complexity Examples**: [references/complexity-examples.md](references/complexity-examples.md) - Skills by tier with rationale
-
-**Shared Patterns**:
-- [anti-rationalization-core.md](../skills/shared-patterns/anti-rationalization-core.md) - Universal rationalization patterns
-- [gate-enforcement.md](../skills/shared-patterns/gate-enforcement.md) - Phase gate patterns
-- [output-schemas.md](../skills/shared-patterns/output-schemas.md) - Standard output formats
diff --git a/agents/system-upgrade-engineer.md b/agents/system-upgrade-engineer.md
index 7740567..3fbbc9f 100644
--- a/agents/system-upgrade-engineer.md
+++ b/agents/system-upgrade-engineer.md
@@ -82,7 +82,7 @@ You have deep expertise in:
- **Priority Classification**: Ranking upgrade items as Critical / Important / Minor
with effort estimates and parallel dispatch groupings
- **Orchestrated Fan-Out**: Dispatching domain specialists (hook-development-engineer,
- agent-creator-engineer, skill-creator-engineer) in parallel for independent changes
+ agent-creator-engineer, skill-creator) in parallel for independent changes
- **Validation Scoring**: Using agent-evaluation before/after to quantify upgrade quality
You follow the `system-upgrade` skill methodology (6 phases) and the pipeline principles:
@@ -101,7 +101,7 @@ This agent operates as an orchestrator for top-down system upgrades.
and wait for explicit approval before Phase 4. No silent mass-edits. Ever.
- **Domain Specialists for Implementation**: Route hook changes to
hook-development-engineer, agent changes to agent-creator-engineer,
- skill changes to skill-creator-engineer. Do NOT implement domain changes inline.
+ skill changes to skill-creator. Do NOT implement domain changes inline.
- **Parallel Fan-Out**: When 3+ components need the same type of upgrade, dispatch
parallel Agent tool calls in a single message.
- **Branch Before Implement**: Create `chore/system-upgrade-YYYY-MM-DD` branch
diff --git a/agents/toolkit-governance-engineer.md b/agents/toolkit-governance-engineer.md
index f6e8087..83b1455 100644
--- a/agents/toolkit-governance-engineer.md
+++ b/agents/toolkit-governance-engineer.md
@@ -10,7 +10,7 @@ description: |
Use when a task targets the toolkit's own structure — editing skills, updating routing,
checking coverage, or enforcing conventions. Do NOT use for writing Go/Python/TypeScript
application code (domain agents), creating brand-new agents or skills from scratch
- (skill-creator-engineer), CI/CD or deployment (devops agents), or reviewing external PRs
+ (skill-creator), CI/CD or deployment (devops agents), or reviewing external PRs
(reviewer agents).
Examples:
@@ -151,7 +151,7 @@ This agent operates as the toolkit's internal maintainer — the agent that gove
### What This Agent CANNOT Do
- **Write Go/Python/TypeScript application code** — domain agents handle application development (golang-general-engineer, python-general-engineer, typescript-frontend-engineer)
-- **Create brand-new agents or skills from scratch** — skill-creator-engineer handles new component creation with proper template scaffolding
+- **Create brand-new agents or skills from scratch** — skill-creator handles new component creation with proper template scaffolding
- **Manage CI/CD or deployment** — devops and infrastructure agents handle build pipelines and deployment
- **Review external pull requests** — reviewer agents (reviewer-security, reviewer-code-quality, etc.) handle PR review with specialized domain knowledge
- **Modify the routing system's core logic** — the /do router's implementation is separate from the routing tables this agent manages
diff --git a/docs/PHILOSOPHY.md b/docs/PHILOSOPHY.md
index c29c74c..af3bbbb 100644
--- a/docs/PHILOSOPHY.md
+++ b/docs/PHILOSOPHY.md
@@ -215,6 +215,53 @@ The principles above describe what the system does when it works. Equally import
**Stale INDEX files:** A new agent or skill was added but the INDEX wasn't regenerated. The router can't find the component. Signal: requests that should match a known agent get routed to the fallback. Recovery: run `scripts/generate-agent-index.py` and `scripts/generate-skill-index.py`.
+## Skills Are Self-Contained Packages
+
+Everything a skill needs lives inside the skill directory. Scripts, viewer templates, bundled agents, reference files, assets — all co-located. Nothing leaks into repo-level `scripts/` or a separate `assets/` directory.
+
+```
+skills/my-skill/
+├── SKILL.md # The workflow
+├── agents/ # Subagent prompts used only by this skill
+├── scripts/ # Deterministic CLI tools this skill invokes
+├── assets/ # Templates, HTML viewers, static files
+└── references/ # Deep context loaded on demand
+```
+
+**Why this matters:** A skill that depends on scripts scattered across the repo is fragile to move, hard to test, and impossible to evaluate in isolation. When everything is bundled, the skill can be:
+- Copied to another project and it works
+- Tested via `run_eval.py` against its own workspace
+- Reviewed as a single unit — all the tooling is visible in one tree
+- Deleted without orphaning dependencies elsewhere
+
+**The exception:** Shared patterns (`shared-patterns/anti-rationalization-core.md`) are referenced across skills. These stay shared. But skill-specific scripts, assets, and agents are always bundled.
+
+**Repo-level `scripts/`** is reserved for toolkit-wide operations (learning-db.py, sync-to-user-claude.py, INDEX generation) — tools that operate on the system as a whole, not on a single skill's workflow.
+
+## Workflow First, Constraints Inline
+
+Skill documents place the workflow (Instructions/Phases) immediately after the frontmatter. Constraints appear inline within the phases they govern, not in a separate upfront section.
+
+**Measured result:** A/B/C testing on Go code generation showed workflow-first ordering (C) swept constraints-first ordering (B) 3-0 across simple, medium, and complex prompts. Agent blind reviewers consistently scored workflow-first higher on testing depth, Go idioms, and benchmark coverage.
+
+**The ordering:**
+
+```
+1. YAML frontmatter (What + When)
+2. Brief overview (How — one paragraph)
+3. Instructions/Phases (The actual workflow, with inline constraints)
+4. Benchmark/Commands Guide (Reference material)
+5. Error Handling (Failure context)
+6. Anti-Patterns (What went wrong before)
+7. References (Pointers to deep context)
+```
+
+**Why it works:** The model encounters the task structure before the constraint framework. Constraints appear at the decision point where they apply — "use table-driven tests because they make adding cases trivial" inside the testing phase, not in a separate Hardcoded Behaviors section 200 lines earlier. The model spends attention on understanding the task, not parsing a constraint taxonomy.
+
+**What moves:** The Operator Context section (Hardcoded/Default/Optional behaviors) decomposes. Each constraint migrates to the phase where it applies. "Run with -race for concurrent code" belongs in Phase 3 (RUN), not in a behavior table.
+
+**What stays:** Error Handling, Anti-Patterns, and References remain at the end as context that's consulted when things go wrong — not before the model has understood what "going right" looks like.
+
## Open Sharing Over Individual Ownership
Ideas matter less than open sharing. In an AI-assisted world, provenance becomes invisible. The toolkit is open source because:
diff --git a/docs/REFERENCE.md b/docs/REFERENCE.md
index e0cd0e5..c4b2b5c 100644
--- a/docs/REFERENCE.md
+++ b/docs/REFERENCE.md
@@ -116,7 +116,7 @@ Request deep expertise: *"Use the [name] agent"*
| `technical-documentation-engineer` | Docs, API references |
| `technical-journalist-writer` | Technical journalism |
| `agent-creator-engineer` | Create new agents |
-| `skill-creator-engineer` | Create new skills |
+| `skill-creator` | Create new skills |
| `hook-development-engineer` | Claude Code hooks |
| `project-coordinator-engineer` | Multi-agent orchestration |
| `research-coordinator-engineer` | Research coordination |
diff --git a/docs/for-claude-code.md b/docs/for-claude-code.md
index 0c48efc..a875b22 100644
--- a/docs/for-claude-code.md
+++ b/docs/for-claude-code.md
@@ -439,7 +439,7 @@ Exit 0 = clean. Exit 1 = patterns found.
| Review | reviewer-security, reviewer-business-logic, reviewer-performance, reviewer-concurrency, reviewer-dead-code |
| Data | database-engineer, sqlite-peewee-engineer, data-engineer |
| Content | technical-documentation-engineer, technical-journalist-writer |
-| Meta | skill-creator-engineer, system-upgrade-engineer, pipeline-orchestrator-engineer, research-coordinator-engineer |
+| Meta | skill-creator, system-upgrade-engineer, pipeline-orchestrator-engineer, research-coordinator-engineer |
| Perses | perses-core-engineer, perses-dashboard-engineer, perses-operator-engineer, perses-plugin-engineer |
| UI/Perf | ui-design-engineer, performance-optimization-engineer, react-portfolio-engineer |
| Research | research-coordinator-engineer, research-subagent-executor |
diff --git a/docs/for-developers.md b/docs/for-developers.md
index 9fd3f6c..83e1709 100644
--- a/docs/for-developers.md
+++ b/docs/for-developers.md
@@ -75,7 +75,7 @@ The agent creator uses the `AGENT_TEMPLATE_V2.md` template and produces a comple
/do create a skill for [your workflow]
```
-Describe the methodology, phases, and quality gates. The `skill-creator-engineer` builds the skill directory, SKILL.md with frontmatter, phase definitions, and updates the index.
+Describe the methodology, phases, and quality gates. The `skill-creator` builds the skill directory, SKILL.md with frontmatter, phase definitions, and updates the index.
**Example prompts:**
- `/do create a skill for database migration safety with pre-migration checks, rollback validation, and post-migration verification`
diff --git a/hooks/adr-enforcement.py b/hooks/adr-enforcement.py
index 4f2d567..fb5bccf 100644
--- a/hooks/adr-enforcement.py
+++ b/hooks/adr-enforcement.py
@@ -180,17 +180,8 @@ def main() -> None:
event = json.loads(raw)
- # Only process PostToolUse events
- event_type = event.get("hook_event_name") or event.get("type", "")
- if event_type != _EVENT_NAME:
- empty_output(_EVENT_NAME).print_and_exit(0)
- return
-
- # Only act on Write or Edit tool calls
- tool_name = event.get("tool_name", "")
- if tool_name not in ("Write", "Edit"):
- empty_output(_EVENT_NAME).print_and_exit(0)
- return
+ # tool_name/event_type filters removed — matcher "Write|Edit" in settings.json
+ # prevents this hook from spawning for non-matching tools.
# Extract file path from tool input
tool_input = event.get("tool_input", {})
diff --git a/hooks/agent-grade-on-change.py b/hooks/agent-grade-on-change.py
index 06303c1..4de4084 100644
--- a/hooks/agent-grade-on-change.py
+++ b/hooks/agent-grade-on-change.py
@@ -90,10 +90,8 @@ def main():
if not hook_input:
return
- # Check if this is a relevant tool call
- tool_name = hook_input.get("tool_name", "")
- if tool_name not in ("Edit", "Write"):
- return
+ # tool_name filter removed — matcher "Write|Edit" in settings.json prevents
+ # this hook from spawning for non-matching tools.
# Extract file path from tool input
tool_input_data = hook_input.get("tool_input", {})
diff --git a/hooks/ci-merge-gate.py b/hooks/ci-merge-gate.py
index f2ec425..f29d9ef 100644
--- a/hooks/ci-merge-gate.py
+++ b/hooks/ci-merge-gate.py
@@ -19,9 +19,8 @@
def main() -> None:
data = json.loads(read_stdin(timeout=2))
- tool = data.get("tool_name", "")
- if tool != "Bash":
- return
+ # tool_name filter removed — matcher "Bash" in settings.json prevents
+ # this hook from spawning for non-Bash tools.
command = data.get("tool_input", {}).get("command", "")
diff --git a/hooks/post-tool-lint-hint.py b/hooks/post-tool-lint-hint.py
index 87f9611..f93a012 100755
--- a/hooks/post-tool-lint-hint.py
+++ b/hooks/post-tool-lint-hint.py
@@ -69,14 +69,8 @@ def main():
event_data = read_stdin(timeout=2)
event = json.loads(event_data)
- # Check this is PostToolUse for Write or Edit
- event_type = event.get("hook_event_name") or event.get("type", "")
- if event_type != "PostToolUse":
- return
-
- tool_name = event.get("tool_name", "")
- if tool_name not in ("Write", "Edit"):
- return
+ # tool_name/event_type filters removed — matcher "Write|Edit" in settings.json
+ # prevents this hook from spawning for non-matching tools.
# Get the file path from tool input
tool_input = event.get("tool_input", {})
diff --git a/hooks/posttool-security-scan.py b/hooks/posttool-security-scan.py
index 8270b56..3fd0796 100755
--- a/hooks/posttool-security-scan.py
+++ b/hooks/posttool-security-scan.py
@@ -143,13 +143,8 @@ def main() -> None:
raw = read_stdin(timeout=2)
event = json.loads(raw)
- event_type = event.get("hook_event_name") or event.get("type", "")
- if event_type != "PostToolUse":
- return
-
- tool_name = event.get("tool_name", "")
- if tool_name not in ("Write", "Edit"):
- return
+ # tool_name/event_type filters removed — matcher "Write|Edit" in settings.json
+ # prevents this hook from spawning for non-matching tools.
tool_input = event.get("tool_input", {})
file_path = tool_input.get("file_path", "")
diff --git a/hooks/posttool-session-reads.py b/hooks/posttool-session-reads.py
index f1b2f62..a18400c 100755
--- a/hooks/posttool-session-reads.py
+++ b/hooks/posttool-session-reads.py
@@ -48,10 +48,8 @@ def main() -> None:
event = json.loads(event_data)
- # Only process Read tool results
- tool_name = event.get("tool_name", "")
- if tool_name != "Read":
- return
+ # tool_name filter removed — matcher "Read" in settings.json prevents
+ # this hook from spawning for non-Read tools.
# Extract file_path from tool_input
tool_input = event.get("tool_input", {})
diff --git a/hooks/pretool-adr-creation-gate.py b/hooks/pretool-adr-creation-gate.py
index 075c79a..a1bfd1d 100644
--- a/hooks/pretool-adr-creation-gate.py
+++ b/hooks/pretool-adr-creation-gate.py
@@ -70,10 +70,8 @@ def main() -> None:
except (json.JSONDecodeError, ValueError):
sys.exit(0)
- # Only gate Write — edits to existing files are fine.
- tool_name = event.get("tool_name", "")
- if tool_name != "Write":
- sys.exit(0)
+ # tool_name filter removed — matcher "Write" in settings.json prevents
+ # this hook from spawning for non-Write tools.
# Bypass env var.
if os.environ.get(_BYPASS_ENV) == "1":
diff --git a/hooks/pretool-branch-safety.py b/hooks/pretool-branch-safety.py
index 406dd58..5706a1e 100644
--- a/hooks/pretool-branch-safety.py
+++ b/hooks/pretool-branch-safety.py
@@ -60,9 +60,8 @@ def main() -> None:
except (json.JSONDecodeError, ValueError):
sys.exit(0)
- tool_name = event.get("tool_name", "")
- if tool_name != "Bash":
- sys.exit(0)
+ # tool_name filter removed — matcher "Bash" in settings.json prevents
+ # this hook from spawning for non-Bash tools.
command = event.get("tool_input", {}).get("command", "")
if "git commit" not in command:
diff --git a/hooks/pretool-creation-gate.py b/hooks/pretool-creation-gate.py
index 4d4e506..2b554a4 100644
--- a/hooks/pretool-creation-gate.py
+++ b/hooks/pretool-creation-gate.py
@@ -4,12 +4,12 @@
PreToolUse:Write Hook: Creation Gate
Blocks direct creation of new agent/skill files that bypass the
-skill-creator-engineer pipeline. Forces the LLM to route through
+skill-creator pipeline. Forces the LLM to route through
proper creation workflows that produce full-depth components.
This is a HARD GATE — it physically prevents the Write tool from creating
new agent or skill files. The LLM receives a [fix-with-agent] directive
-telling it to use skill-creator-engineer.
+telling it to use skill-creator.
Detection logic:
- Tool is Write (not Edit — edits to existing files are allowed)
@@ -82,9 +82,9 @@ def main() -> None:
# Block: new agent or skill file being created outside the creator pipeline
component_type = "agent" if is_agent else "skill"
print(
- f"[creation-gate] BLOCKED: New {component_type} must be created via skill-creator-engineer or skill-creation-pipeline.\n"
+ f"[creation-gate] BLOCKED: New {component_type} must be created via skill-creator or skill-creation-pipeline.\n"
f"[creation-gate] Path: {file_path}\n"
- f"[fix-with-agent] skill-creator-engineer",
+ f"[fix-with-agent] skill-creator",
file=sys.stderr,
)
sys.exit(2)
diff --git a/hooks/pretool-file-backup.py b/hooks/pretool-file-backup.py
index dab630a..9470068 100755
--- a/hooks/pretool-file-backup.py
+++ b/hooks/pretool-file-backup.py
@@ -49,9 +49,8 @@ def main() -> None:
except (json.JSONDecodeError, ValueError):
sys.exit(0)
- tool_name = event.get("tool_name", "")
- if tool_name != "Edit":
- sys.exit(0)
+ # tool_name filter removed — matcher "Edit" in settings.json prevents
+ # this hook from spawning for non-Edit tools.
tool_input = event.get("tool_input", {})
file_path = tool_input.get("file_path", "")
diff --git a/hooks/pretool-learning-injector.py b/hooks/pretool-learning-injector.py
index df5f982..5216335 100755
--- a/hooks/pretool-learning-injector.py
+++ b/hooks/pretool-learning-injector.py
@@ -31,9 +31,6 @@
EVENT_NAME = "PreToolUse"
-# Tools that benefit from proactive learning injection
-TARGET_TOOLS = {"Bash", "Edit"}
-
# Max characters in the injected context to stay lightweight
MAX_CONTEXT_CHARS = 500
@@ -160,11 +157,9 @@ def main():
event = json.loads(event_data)
- # Early exit for non-target tools
+ # tool_name filter removed — matcher "Bash|Edit" in settings.json prevents
+ # this hook from spawning for non-matching tools.
tool_name = event.get("tool_name", "")
- if tool_name not in TARGET_TOOLS:
- empty_output(EVENT_NAME).print_and_exit()
-
tool_input = event.get("tool_input", {})
# Extract tags based on tool type
diff --git a/hooks/pretool-plan-gate.py b/hooks/pretool-plan-gate.py
index 04c7398..2b2fa0b 100644
--- a/hooks/pretool-plan-gate.py
+++ b/hooks/pretool-plan-gate.py
@@ -54,9 +54,8 @@ def main() -> None:
except (json.JSONDecodeError, ValueError):
sys.exit(0)
- tool_name = event.get("tool_name", "")
- if tool_name not in ("Write", "Edit"):
- sys.exit(0)
+ # tool_name filter removed — matcher "Write|Edit" in settings.json prevents
+ # this hook from spawning for non-matching tools.
# Bypass env var — set by the plans skill itself.
if os.environ.get(_BYPASS_ENV) == "1":
diff --git a/hooks/pretool-prompt-injection-scanner.py b/hooks/pretool-prompt-injection-scanner.py
index 88348d6..d3502ae 100644
--- a/hooks/pretool-prompt-injection-scanner.py
+++ b/hooks/pretool-prompt-injection-scanner.py
@@ -268,11 +268,9 @@ def main() -> None:
print(f"[injection-scanner] JSON parse failed: {e}", file=sys.stderr)
empty_output(EVENT_NAME).print_and_exit()
- # Field name compatibility: try new names first, fall back to old
+ # tool_name filter removed — matcher "Write|Edit" in settings.json prevents
+ # this hook from spawning for non-matching tools.
tool = event.get("tool_name") or event.get("tool", "")
- if tool not in ("Write", "Edit"):
- empty_output(EVENT_NAME).print_and_exit()
-
tool_input = event.get("tool_input", event.get("input", {}))
file_path = tool_input.get("file_path", "")
if not file_path:
diff --git a/hooks/pretool-subagent-warmstart.py b/hooks/pretool-subagent-warmstart.py
index 2a1a871..1da4886 100755
--- a/hooks/pretool-subagent-warmstart.py
+++ b/hooks/pretool-subagent-warmstart.py
@@ -251,10 +251,8 @@ def main() -> None:
event = json.loads(event_data)
- # Only process Agent tool invocations
- tool_name = event.get("tool_name", "")
- if tool_name != "Agent":
- return
+ # tool_name filter removed — matcher "Agent" in settings.json prevents
+ # this hook from spawning for non-Agent tools.
# Gather context from various sources
files = load_recent_reads(Path(SESSION_READS_FILE))
diff --git a/hooks/pretool-synthesis-gate.py b/hooks/pretool-synthesis-gate.py
index 086932b..f092066 100755
--- a/hooks/pretool-synthesis-gate.py
+++ b/hooks/pretool-synthesis-gate.py
@@ -123,9 +123,8 @@ def main() -> None:
except (json.JSONDecodeError, ValueError):
sys.exit(0)
- tool_name = event.get("tool_name", "")
- if tool_name not in ("Write", "Edit"):
- sys.exit(0)
+ # tool_name filter removed — matcher "Write|Edit" in settings.json prevents
+ # this hook from spawning for non-matching tools.
# Bypass env var — set by the consultation skill itself.
if os.environ.get(_BYPASS_ENV) == "1":
diff --git a/hooks/pretool-unified-gate.py b/hooks/pretool-unified-gate.py
index 81d6751..79b0cfe 100644
--- a/hooks/pretool-unified-gate.py
+++ b/hooks/pretool-unified-gate.py
@@ -295,9 +295,9 @@ def check_creation_gate(file_path: str) -> None:
component_type = "agent" if is_agent else "skill"
_block(
- f"[creation-gate] BLOCKED: New {component_type} must be created via skill-creator-engineer or skill-creation-pipeline.\n"
+ f"[creation-gate] BLOCKED: New {component_type} must be created via skill-creator or skill-creation-pipeline.\n"
f"[creation-gate] Path: {file_path}\n"
- f"[fix-with-agent] skill-creator-engineer"
+ f"[fix-with-agent] skill-creator"
)
diff --git a/hooks/record-activation.py b/hooks/record-activation.py
index 9ac1cb3..e52fdfb 100644
--- a/hooks/record-activation.py
+++ b/hooks/record-activation.py
@@ -28,18 +28,14 @@
from hook_utils import get_session_id
from stdin_timeout import read_stdin
-# Tools that represent meaningful work completing successfully
-TRACKED_TOOLS = {"Edit", "Write", "Bash"}
-
def main() -> None:
"""Record session activation stats on successful tool completions."""
try:
hook_input = json.loads(read_stdin(timeout=2))
- tool_name = hook_input.get("tool_name", "")
- if tool_name not in TRACKED_TOOLS:
- return
+ # tool_name filter removed — matcher "Edit|Write|Bash" in settings.json
+ # prevents this hook from spawning for non-matching tools.
tool_result = hook_input.get("tool_result", {})
if tool_result.get("is_error", False):
diff --git a/hooks/retro-graduation-gate.py b/hooks/retro-graduation-gate.py
index 76bcc3c..f7900b7 100644
--- a/hooks/retro-graduation-gate.py
+++ b/hooks/retro-graduation-gate.py
@@ -30,16 +30,8 @@ def main() -> None:
empty_output(EVENT).print_and_exit(0)
return
- # Event type guard (defensive — matches peer hook pattern)
- event_type = data.get("hook_event_name") or data.get("type", "")
- if event_type and event_type != EVENT:
- empty_output(EVENT).print_and_exit(0)
- return
-
- # Early-exit: only care about Bash tool (PostToolUse schema: tool_name)
- if data.get("tool_name") != "Bash":
- empty_output(EVENT).print_and_exit(0)
- return
+ # tool_name/event_type filters removed — matcher "Bash" in settings.json
+ # prevents this hook from spawning for non-Bash tools.
# Early-exit: check if output indicates a PR was created (PostToolUse schema: tool_result.output)
tool_result = data.get("tool_result", {})
diff --git a/hooks/review-capture.py b/hooks/review-capture.py
index 724f1db..9883a06 100644
--- a/hooks/review-capture.py
+++ b/hooks/review-capture.py
@@ -117,10 +117,8 @@ def main() -> None:
event = json.loads(event_data)
- # Only process Agent tool results
- tool_name = event.get("tool_name", "")
- if tool_name != "Agent":
- return
+ # tool_name filter removed — matcher "Agent" in settings.json prevents
+ # this hook from spawning for non-Agent tools.
# Get tool result text
tool_result = event.get("tool_result", "")
diff --git a/hooks/skill-evaluator.py b/hooks/skill-evaluator.py
index d142510..39402d5 100644
--- a/hooks/skill-evaluator.py
+++ b/hooks/skill-evaluator.py
@@ -43,7 +43,7 @@
"testing-automation-engineer": "Unit/E2E tests, Playwright, CI pipelines",
# Meta/Creation
"agent-creator-engineer": "Create new specialized agents",
- "skill-creator-engineer": "Create new Claude skills",
+ "skill-creator": "Create new Claude skills",
"hook-development-engineer": "Create Claude Code hooks, event handlers",
"mcp-local-docs-engineer": "Build MCP servers for documentation",
# Coordination
@@ -151,7 +151,7 @@ def get_evaluation_prompt(complexity: str) -> str:
- Docs: technical-documentation-engineer, technical-journalist-writer
- UI: ui-design-engineer, performance-optimization-engineer
- Testing: testing-automation-engineer
-- Meta: agent-creator-engineer, skill-creator-engineer, hook-development-engineer
+- Meta: agent-creator-engineer, skill-creator, hook-development-engineer
- Research: research-coordinator-engineer, project-coordinator-engineer
- Critique: roast skill (5 personas: contrarian, newcomer, builder, senior, pedant)"""
diff --git a/hooks/tests/test_post_tool_lint.py b/hooks/tests/test_post_tool_lint.py
index 70ae6d3..88102b1 100755
--- a/hooks/tests/test_post_tool_lint.py
+++ b/hooks/tests/test_post_tool_lint.py
@@ -94,7 +94,11 @@ def test_ignores_non_lintable_files():
def test_ignores_read_tool():
- """Hook should only trigger for Write/Edit, not Read."""
+ """Read tool filtering is now handled by matcher 'Write|Edit' in settings.json.
+
+ When called directly (without matcher), the hook processes any tool_name.
+ This test verifies the hook still exits 0 (non-blocking) for any input.
+ """
setup()
event = {
"type": "PostToolUse",
@@ -104,7 +108,7 @@ def test_ignores_read_tool():
stdout, stderr, code = run_hook(event)
assert code == 0
- assert stdout == ""
+ # Note: hook may produce output since tool_name filter was moved to matcher
def test_handles_missing_file_path():
diff --git a/hooks/tests/test_posttool_session_reads.py b/hooks/tests/test_posttool_session_reads.py
index 8e0fc05..6105971 100644
--- a/hooks/tests/test_posttool_session_reads.py
+++ b/hooks/tests/test_posttool_session_reads.py
@@ -51,30 +51,23 @@ def run_hook(event: dict) -> tuple[str, str, int]:
class TestToolNameFiltering:
"""Only Read tool events should be processed."""
- def test_ignores_write_tool(self, tmp_path, monkeypatch):
- """Write tool events should produce no output and no file."""
- monkeypatch.chdir(tmp_path)
- event = {
- "tool_name": "Write",
- "tool_input": {"file_path": "/some/file.py"},
- }
- stdout, stderr, code = run_hook(event)
- assert code == 0
- # No session-reads.txt should be created
- assert not (tmp_path / ".claude" / "session-reads.txt").exists()
+ def test_nonread_tool_exits_zero(self, tmp_path, monkeypatch):
+ """Non-Read tool filtering is now handled by matcher 'Read' in settings.json.
- def test_ignores_edit_tool(self, tmp_path, monkeypatch):
- """Edit tool events should be ignored."""
+ When called directly (without matcher), the hook processes any tool_name.
+ This test verifies the hook still exits 0 (non-blocking) for any input.
+ """
monkeypatch.chdir(tmp_path)
- event = {
- "tool_name": "Edit",
- "tool_input": {"file_path": "/some/file.py"},
- }
- stdout, stderr, code = run_hook(event)
- assert code == 0
+ for tool in ("Write", "Edit", "Bash"):
+ event = {
+ "tool_name": tool,
+ "tool_input": {"file_path": "/some/file.py"} if tool != "Bash" else {"command": "ls"},
+ }
+ stdout, stderr, code = run_hook(event)
+ assert code == 0
def test_ignores_bash_tool(self, tmp_path, monkeypatch):
- """Bash tool events should be ignored."""
+ """Bash tool events should be ignored (no file_path to extract)."""
monkeypatch.chdir(tmp_path)
event = {
"tool_name": "Bash",
diff --git a/hooks/tests/test_pretool_subagent_warmstart.py b/hooks/tests/test_pretool_subagent_warmstart.py
index f8a1b51..62da3c1 100644
--- a/hooks/tests/test_pretool_subagent_warmstart.py
+++ b/hooks/tests/test_pretool_subagent_warmstart.py
@@ -58,28 +58,19 @@ def run_hook(event: dict) -> tuple[str, str, int]:
class TestToolNameFiltering:
"""Only Agent tool events should be processed."""
- def test_ignores_read_tool(self):
- """Read tool events should produce no context output."""
- event = {"tool_name": "Read", "tool_input": {"file_path": "/x"}}
- stdout, stderr, code = run_hook(event)
- assert code == 0
- # Should be empty or empty hook output (no warmstart context)
- if stdout.strip():
- output = json.loads(stdout)
- hook_out = output.get("hookSpecificOutput", {})
- assert "additionalContext" not in hook_out or "[warmstart]" not in hook_out.get("additionalContext", "")
-
- def test_ignores_write_tool(self):
- """Write tool events should be ignored."""
- event = {"tool_name": "Write", "tool_input": {"file_path": "/x"}}
- stdout, stderr, code = run_hook(event)
- assert code == 0
-
- def test_ignores_bash_tool(self):
- """Bash tool events should be ignored."""
- event = {"tool_name": "Bash", "tool_input": {"command": "ls"}}
- stdout, stderr, code = run_hook(event)
- assert code == 0
+ def test_nonagent_tools_exit_zero(self):
+ """Non-Agent tool filtering is now handled by matcher 'Agent' in settings.json.
+
+ When called directly (without matcher), the hook processes any tool_name.
+ This test verifies the hook still exits 0 (non-blocking) for any input.
+ """
+ for tool, tool_input in [
+ ("Read", {"file_path": "/x"}),
+ ("Write", {"file_path": "/x"}),
+ ("Bash", {"command": "ls"}),
+ ]:
+ stdout, stderr, code = run_hook({"tool_name": tool, "tool_input": tool_input})
+ assert code == 0
def test_processes_agent_tool(self, tmp_path, monkeypatch):
"""Agent tool events should produce warmstart context."""
diff --git a/hooks/usage-tracker.py b/hooks/usage-tracker.py
index 6ea3847..73626de 100644
--- a/hooks/usage-tracker.py
+++ b/hooks/usage-tracker.py
@@ -32,17 +32,10 @@ def main():
event = json.loads(event_data)
- # Only process PostToolUse events
- event_type = event.get("hook_event_name") or event.get("type", "")
- if event_type != "PostToolUse":
- return
-
+ # tool_name/event_type filters removed — matcher "Skill|Agent" in settings.json
+ # prevents this hook from spawning for non-matching tools.
tool_name = event.get("tool_name", "")
- # Only track Skill and Agent tools — exit silently for everything else
- if tool_name not in ("Skill", "Agent"):
- return
-
# Lazy import — only loaded when we actually need to record
from hook_utils import get_project_dir, get_session_id
from usage_db import record_agent, record_skill
diff --git a/pipelines/INDEX.json b/pipelines/INDEX.json
index 464d163..13a4e88 100644
--- a/pipelines/INDEX.json
+++ b/pipelines/INDEX.json
@@ -27,7 +27,7 @@
"agent-evaluation",
"system-upgrade"
],
- "agent": "skill-creator-engineer"
+ "agent": "skill-creator"
},
"article-evaluation-pipeline": {
"file": "pipelines/article-evaluation-pipeline/SKILL.md",
@@ -626,7 +626,7 @@
"agent-evaluation",
"routing-table-updater"
],
- "agent": "skill-creator-engineer"
+ "agent": "skill-creator"
},
"system-upgrade": {
"file": "pipelines/system-upgrade/SKILL.md",
diff --git a/pipelines/agent-upgrade/SKILL.md b/pipelines/agent-upgrade/SKILL.md
index 5230821..69032bd 100644
--- a/pipelines/agent-upgrade/SKILL.md
+++ b/pipelines/agent-upgrade/SKILL.md
@@ -10,7 +10,7 @@ description: |
version: 1.0.0
user-invocable: false
argument-hint: ""
-agent: skill-creator-engineer
+agent: skill-creator
allowed-tools:
- Read
- Bash
diff --git a/pipelines/pipeline-scaffolder/references/architecture-rules.md b/pipelines/pipeline-scaffolder/references/architecture-rules.md
index 29afef7..fd34cd1 100644
--- a/pipelines/pipeline-scaffolder/references/architecture-rules.md
+++ b/pipelines/pipeline-scaffolder/references/architecture-rules.md
@@ -84,7 +84,7 @@ Phase 1: DISCOVER (sequential — needs full context)
↓
Phase 2: SCAFFOLD (fan-out — group by creator type)
├─ agent-creator-engineer: Agent A, Agent B, Agent C (1..N)
- ├─ skill-creator-engineer: Skill X, Skill Y (1..M)
+ ├─ skill-creator: Skill X, Skill Y (1..M)
├─ hook-development-engineer: Hook 1, Hook 2 (1..K)
└─ Direct: Script 1, Script 2 (1..J)
↓ (fan-in — wait for all)
diff --git a/pipelines/skill-creation-pipeline/SKILL.md b/pipelines/skill-creation-pipeline/SKILL.md
index 6a1e0f5..f3a37fc 100644
--- a/pipelines/skill-creation-pipeline/SKILL.md
+++ b/pipelines/skill-creation-pipeline/SKILL.md
@@ -8,7 +8,7 @@ description: |
Use for "create skill pipeline", "new skill formal", "skill with gates".
version: 1.0.0
user-invocable: false
-agent: skill-creator-engineer
+agent: skill-creator
allowed-tools:
- Read
- Bash
@@ -38,7 +38,7 @@ routing:
## Operator Context
-This pipeline wraps `skill-creator-engineer` with explicit discovery, design
+This pipeline wraps `skill-creator` with explicit discovery, design
review, and validation gates. It is the **formal path** for creating new skills
— as opposed to ad-hoc creation — and should be used whenever skill quality,
uniqueness, or routing correctness is important. The pipeline does not replace
@@ -187,7 +187,7 @@ DESIGN BRIEF: [skill-name]
==========================
Complexity Tier: [Simple | Medium | Complex | Comprehensive]
-Agent Binding: skill-creator-engineer (default) or [other agent if domain-specific]
+Agent Binding: skill-creator (default) or [other agent if domain-specific]
User-Invocable: [true | false]
Phases:
@@ -323,7 +323,7 @@ Read the current INDEX.json and append an entry for the new skill:
"path": "skills/skill-name/SKILL.md",
"description": "[first line of the frontmatter description]",
"user-invocable": true,
- "agent": "skill-creator-engineer"
+ "agent": "skill-creator"
}
```
diff --git a/pipelines/system-upgrade/SKILL.md b/pipelines/system-upgrade/SKILL.md
index 55f7f9c..7bb5b02 100644
--- a/pipelines/system-upgrade/SKILL.md
+++ b/pipelines/system-upgrade/SKILL.md
@@ -46,7 +46,7 @@ complementing the **bottom-up** retro-knowledge-injector.
### Hardcoded Behaviors (Always Apply)
- **Show Plan Before Implementing**: Phase 3 output (ranked upgrade list) MUST be presented to the user and approved before Phase 4 begins. Never silently execute upgrades.
-- **Reuse Domain Agents**: Phase 4 (IMPLEMENT) dispatches to existing domain agents (skill-creator-engineer, agent-creator-engineer, hook-development-engineer, golang-general-engineer, etc.). The upgrade engineer orchestrates; specialists execute.
+- **Reuse Domain Agents**: Phase 4 (IMPLEMENT) dispatches to existing domain agents (skill-creator, agent-creator-engineer, hook-development-engineer, golang-general-engineer, etc.). The upgrade engineer orchestrates; specialists execute.
- **Parallel Fan-Out**: When 3+ components need the same type of upgrade, dispatch in parallel using multiple Agent tool calls in a single message.
- **Score Delta Required**: Phase 5 (VALIDATE) must produce before/after evaluation delta, not just "looks good." Use `agent-evaluation` skill.
- **Trigger Type Determines Input**: The three trigger types (claude-release, goal-change, retro-driven) require different input parsing in Phase 1.
@@ -202,7 +202,7 @@ IMPORTANT (should fix):
4. skills/go-testing/SKILL.md — Apply new pattern from retro L2 [inject-pattern, ~10min]
MINOR (nice to have):
- 5. agents/skill-creator-engineer.md — Add new frontmatter field docs [upgrade, ~5min]
+ 5. agents/skill-creator.md — Add new frontmatter field docs [upgrade, ~5min]
Total: 5 changes across 5 components
Parallel dispatch: 3 groups (hooks, agents, skills)
@@ -232,10 +232,10 @@ git checkout -b chore/system-upgrade-$(date +%Y-%m-%d)
| Change Domain | Domain Agent |
|--------------|-------------|
| Hook modifications | hook-development-engineer |
-| Agent upgrades | agent-creator-engineer (or skill-creator-engineer for agents) |
-| Skill upgrades | skill-creator-engineer |
+| Agent upgrades | agent-creator-engineer (or skill-creator for agents) |
+| Skill upgrades | skill-creator |
| Routing changes | routing-table-updater |
-| Pattern injection | skill-creator-engineer or direct Edit |
+| Pattern injection | skill-creator or direct Edit |
**Step 2**: Dispatch parallel agents for independent groups. Use a single message with multiple Agent tool calls for changes that don't depend on each other.
@@ -365,7 +365,7 @@ Solution: Manually copy modified files to `~/.claude/` equivalent directories. R
### Anti-Pattern 2: Handling All Changes Directly Instead of Dispatching
**What it looks like**: Making all edits inline rather than routing to domain agents
-**Why wrong**: Domain agents (skill-creator-engineer, hook-development-engineer) know the templates and anti-patterns for their domain
+**Why wrong**: Domain agents (skill-creator, hook-development-engineer) know the templates and anti-patterns for their domain
**Do instead**: Dispatch to domain agents for anything beyond simple pattern injection
### Anti-Pattern 3: Auditing Everything Every Time
diff --git a/scripts/audit-tool-restrictions.py b/scripts/audit-tool-restrictions.py
index 6f5e886..1e0301e 100644
--- a/scripts/audit-tool-restrictions.py
+++ b/scripts/audit-tool-restrictions.py
@@ -131,7 +131,7 @@
"python-openstack-engineer": "code-modifier",
"rabbitmq-messaging-engineer": "code-modifier",
"react-portfolio-engineer": "code-modifier",
- "skill-creator-engineer": "code-modifier",
+ "skill-creator": "code-modifier",
"sqlite-peewee-engineer": "code-modifier",
"testing-automation-engineer": "code-modifier",
"typescript-debugging-engineer": "code-modifier",
diff --git a/scripts/routing-benchmark.json b/scripts/routing-benchmark.json
index f41d1a9..7f80cdc 100644
--- a/scripts/routing-benchmark.json
+++ b/scripts/routing-benchmark.json
@@ -284,10 +284,9 @@
},
{
"request": "create a new Claude Code skill with quality gates",
- "expected_agent": "skill-creator-engineer",
- "expected_skill": "skill-creation-pipeline",
+ "expected_skill": "skill-creator",
"category": "meta-tooling",
- "notes": "Skill creation — agent + pipeline pairing"
+ "notes": "Skill creation — skill-creator handles the full eval-driven workflow"
},
{
"request": "create a new hook for PostToolUse events",
diff --git a/skills/INDEX.json b/skills/INDEX.json
index 385355c..c9bc4de 100644
--- a/skills/INDEX.json
+++ b/skills/INDEX.json
@@ -1,6 +1,6 @@
{
"version": "2.0",
- "generated": "2026-03-25T23:05:47Z",
+ "generated": "2026-03-27T03:14:10Z",
"generated_by": "scripts/generate-skill-index.py",
"skills": {
"adr-consultation": {
@@ -16,7 +16,7 @@
"adr consultation"
],
"category": "meta",
- "user_invocable": true,
+ "user_invocable": false,
"version": "1.0.0",
"pairs_with": [
"feature-design",
@@ -121,7 +121,7 @@
"find unused"
],
"category": "code-quality",
- "user_invocable": true,
+ "user_invocable": false,
"version": "2.0.0"
},
"code-linting": {
@@ -206,26 +206,27 @@
},
"content-engine": {
"file": "skills/content-engine/SKILL.md",
- "description": "Repurpose a source asset into platform-native social content variants for X, LinkedIn, TikTok, YouTube, and newsletter. Produces content_ideas.md and content_drafts.md with a quality gate before delivery.",
+ "description": "Repurpose a source asset (article, demo, launch note, insight) into platform-native social content variants.",
"triggers": [
"repurpose this",
"adapt for social",
"turn this into posts",
"content from article",
"content from demo",
+ "content from doc",
"write variants for",
"social content from",
"platform variants",
"repurpose for"
],
"category": "content",
- "user_invocable": true,
+ "user_invocable": false,
"version": "1.0.0",
"pairs_with": [
"x-api",
"crosspost"
],
- "disambiguate": "voice-writer"
+ "model": "sonnet"
},
"create-voice": {
"file": "skills/create-voice/SKILL.md",
@@ -241,7 +242,7 @@
],
"category": "content",
"force_route": true,
- "user_invocable": true,
+ "user_invocable": false,
"version": "1.0.0",
"pairs_with": [
"voice-calibrator",
@@ -358,7 +359,7 @@
"10 perspectives"
],
"category": "meta-tooling",
- "user_invocable": true,
+ "user_invocable": false,
"version": "2.0.0"
},
"docs-sync-checker": {
@@ -385,16 +386,15 @@
"POM",
"test flakiness"
],
- "category": "testing",
- "user_invocable": true,
+ "user_invocable": false,
"version": "1.0.0",
- "agent": "testing-automation-engineer",
- "model": "sonnet",
"pairs_with": [
"testing-automation-engineer",
"typescript-frontend-engineer",
"test-driven-development"
- ]
+ ],
+ "agent": "testing-automation-engineer",
+ "model": "sonnet"
},
"endpoint-validator": {
"file": "skills/endpoint-validator/SKILL.md",
@@ -425,7 +425,7 @@
],
"category": "process",
"force_route": true,
- "user_invocable": true,
+ "user_invocable": false,
"version": "1.0.0",
"pairs_with": []
},
@@ -442,7 +442,7 @@
],
"category": "process",
"force_route": true,
- "user_invocable": true,
+ "user_invocable": false,
"version": "2.0.0",
"pairs_with": [
"feature-plan",
@@ -461,7 +461,7 @@
],
"category": "process",
"force_route": true,
- "user_invocable": true,
+ "user_invocable": false,
"version": "2.0.0",
"pairs_with": [
"feature-plan",
@@ -481,7 +481,7 @@
],
"category": "process",
"force_route": true,
- "user_invocable": true,
+ "user_invocable": false,
"version": "2.0.0",
"pairs_with": [
"feature-design",
@@ -501,7 +501,7 @@
],
"category": "process",
"force_route": true,
- "user_invocable": true,
+ "user_invocable": false,
"version": "2.0.0",
"pairs_with": [
"feature-validate",
@@ -521,7 +521,7 @@
],
"category": "process",
"force_route": true,
- "user_invocable": true,
+ "user_invocable": false,
"version": "2.0.0",
"pairs_with": [
"feature-implement",
@@ -553,32 +553,6 @@
"version": "2.0.0",
"pairs_with": []
},
- "frontend-slides": {
- "file": "skills/frontend-slides/SKILL.md",
- "description": "Browser-based HTML presentation generation with viewport-fit enforcement, curated style presets, and deterministic overflow validation. Three paths: new build, PPTX-to-HTML conversion, or HTML deck enhancement.",
- "triggers": [
- "HTML slides",
- "browser presentation",
- "web deck",
- "reveal-style",
- "viewport presentation",
- "convert PPTX to web",
- "convert PPTX to HTML",
- "slides for a browser",
- "kiosk presentation",
- "interactive presentation keyboard",
- "projector browser"
- ],
- "category": "frontend",
- "user_invocable": true,
- "version": "1.0.0",
- "agent": "typescript-frontend-engineer",
- "model": "sonnet",
- "pairs_with": [
- "typescript-frontend-engineer",
- "pptx-generator"
- ]
- },
"forensics": {
"file": "skills/forensics/SKILL.md",
"description": "Post-mortem diagnostic analysis of failed or stuck workflows.",
@@ -597,7 +571,7 @@
"incident review"
],
"category": "process",
- "user_invocable": true,
+ "user_invocable": false,
"version": "1.0.0",
"pairs_with": [
"systematic-debugging",
@@ -605,6 +579,32 @@
"plan-checker"
]
},
+ "frontend-slides": {
+ "file": "skills/frontend-slides/SKILL.md",
+ "description": "Browser-based HTML presentation generation with viewport-fit enforcement.",
+ "triggers": [
+ "HTML slides",
+ "browser presentation",
+ "web deck",
+ "reveal-style",
+ "viewport presentation",
+ "convert PPTX to web",
+ "convert PPTX to HTML",
+ "slides for a browser",
+ "kiosk presentation",
+ "interactive presentation keyboard",
+ "projector browser"
+ ],
+ "category": "frontend",
+ "user_invocable": false,
+ "version": "1.0.0",
+ "pairs_with": [
+ "typescript-frontend-engineer",
+ "pptx-generator"
+ ],
+ "agent": "typescript-frontend-engineer",
+ "model": "sonnet"
+ },
"full-repo-review": {
"file": "skills/full-repo-review/SKILL.md",
"description": "Run comprehensive 3-wave review against all source files in the repo, producing a prioritized issue backlog.",
@@ -654,7 +654,7 @@
"make claude.md"
],
"category": "documentation",
- "user_invocable": true,
+ "user_invocable": false,
"version": "1.0.0",
"pairs_with": [
"go-sapcc-conventions",
@@ -667,11 +667,7 @@
"triggers": [
"commit",
"stage and commit",
- "commit changes",
- "save my work",
- "commit this",
- "save progress",
- "checkpoint"
+ "commit changes"
],
"category": "git-workflow",
"force_route": true,
@@ -703,7 +699,7 @@
"github inbox"
],
"category": "github",
- "user_invocable": true,
+ "user_invocable": false,
"version": "1.0.0",
"pairs_with": [],
"model": "sonnet"
@@ -875,7 +871,7 @@
"headless agent"
],
"category": "process",
- "user_invocable": true,
+ "user_invocable": false,
"version": "1.0.0",
"agent": "python-general-engineer"
},
@@ -940,7 +936,7 @@
"wiring check"
],
"category": "process",
- "user_invocable": true,
+ "user_invocable": false,
"version": "1.0.0",
"pairs_with": [
"feature-implement",
@@ -961,7 +957,7 @@
"reframe positively"
],
"category": "content",
- "user_invocable": true,
+ "user_invocable": false,
"version": "1.0.0",
"pairs_with": [
"voice-writer",
@@ -969,6 +965,54 @@
"voice-validator"
]
},
+ "kotlin-coroutines": {
+ "file": "skills/kotlin-coroutines/SKILL.md",
+ "description": "Kotlin structured concurrency, Flow, Channel, and cancellation patterns",
+ "triggers": [
+ "kotlin-coroutines",
+ "kotlin",
+ "coroutines"
+ ],
+ "user_invocable": false,
+ "version": "1.0.0",
+ "agent": "general-purpose"
+ },
+ "kotlin-testing": {
+ "file": "skills/kotlin-testing/SKILL.md",
+ "description": "Kotlin testing patterns with JUnit 5, Kotest, and coroutine test dispatchers",
+ "triggers": [
+ "kotlin-testing",
+ "kotlin",
+ "testing"
+ ],
+ "user_invocable": false,
+ "version": "1.0.0",
+ "agent": "general-purpose"
+ },
+ "kubernetes-debugging": {
+ "file": "skills/kubernetes-debugging/SKILL.md",
+ "description": "Kubernetes debugging methodology for pod failures, networking issues, and resource problems",
+ "triggers": [
+ "kubernetes-debugging",
+ "kubernetes",
+ "debugging"
+ ],
+ "user_invocable": false,
+ "version": "1.0.0",
+ "agent": "kubernetes-helm-engineer"
+ },
+ "kubernetes-security": {
+ "file": "skills/kubernetes-security/SKILL.md",
+ "description": "Kubernetes security patterns including RBAC, PodSecurityStandards, network policies, and secret management",
+ "triggers": [
+ "kubernetes-security",
+ "kubernetes",
+ "security"
+ ],
+ "user_invocable": false,
+ "version": "1.0.0",
+ "agent": "kubernetes-helm-engineer"
+ },
"learn": {
"file": "skills/learn/SKILL.md",
"description": "Manually teach Claude Code an error pattern and its solution, storing it in the learning database with high confidence.",
@@ -978,7 +1022,7 @@
"manual learning entry"
],
"category": "meta-tooling",
- "user_invocable": true,
+ "user_invocable": false,
"version": "2.0.0"
},
"link-auditor": {
@@ -1058,7 +1102,7 @@
"wrap up session"
],
"category": "process",
- "user_invocable": true,
+ "user_invocable": false,
"version": "1.0.0",
"pairs_with": [
"resume-work"
@@ -1167,7 +1211,7 @@
"first-time Perses setup"
],
"category": "perses",
- "user_invocable": true,
+ "user_invocable": false,
"version": "2.0.0",
"agent": "perses-dashboard-engineer"
},
@@ -1232,6 +1276,30 @@
"version": "2.0.0",
"agent": "perses-dashboard-engineer"
},
+ "php-quality": {
+ "file": "skills/php-quality/SKILL.md",
+ "description": "PHP code quality patterns including PSR standards, strict types, and framework idioms",
+ "triggers": [
+ "php-quality",
+ "php",
+ "quality"
+ ],
+ "user_invocable": false,
+ "version": "1.0.0",
+ "agent": "general-purpose"
+ },
+ "php-testing": {
+ "file": "skills/php-testing/SKILL.md",
+ "description": "PHP testing patterns with PHPUnit, test doubles, and database testing",
+ "triggers": [
+ "php-testing",
+ "php",
+ "testing"
+ ],
+ "user_invocable": false,
+ "version": "1.0.0",
+ "agent": "general-purpose"
+ },
"plan-checker": {
"file": "skills/plan-checker/SKILL.md",
"description": "Validate plans against 10 verification dimensions before execution begins.",
@@ -1245,7 +1313,7 @@
"pre-execution check"
],
"category": "process",
- "user_invocable": true,
+ "user_invocable": false,
"version": "1.0.0",
"pairs_with": [
"feature-plan",
@@ -1301,7 +1369,7 @@
"plant-seed"
],
"category": "process",
- "user_invocable": true,
+ "user_invocable": false,
"version": "1.0.0",
"pairs_with": [
"feature-design"
@@ -1352,7 +1420,7 @@
"prune branches"
],
"category": "git-workflow",
- "user_invocable": true,
+ "user_invocable": false,
"version": "2.0.0"
},
"pr-fix": {
@@ -1364,7 +1432,7 @@
"pr-fix"
],
"category": "git-workflow",
- "user_invocable": true,
+ "user_invocable": false,
"version": "2.0.0"
},
"pr-miner": {
@@ -1397,7 +1465,7 @@
"address review comments"
],
"category": "git-workflow",
- "user_invocable": true,
+ "user_invocable": false,
"version": "2.0.0"
},
"pr-status": {
@@ -1410,7 +1478,7 @@
],
"category": "git-workflow",
"force_route": true,
- "user_invocable": true,
+ "user_invocable": false,
"version": "2.0.0"
},
"pr-sync": {
@@ -1420,11 +1488,7 @@
"push",
"push changes",
"create PR",
- "sync to GitHub",
- "open a pull request",
- "make a PR",
- "submit PR",
- "push and PR"
+ "sync to GitHub"
],
"category": "git-workflow",
"force_route": true,
@@ -1445,7 +1509,7 @@
],
"category": "process",
"force_route": true,
- "user_invocable": true,
+ "user_invocable": false,
"version": "1.0.0",
"pairs_with": [
"feature-design",
@@ -1542,7 +1606,7 @@
"Reddit reports"
],
"category": "process",
- "user_invocable": true,
+ "user_invocable": false,
"version": "1.0.0",
"agent": "python-general-engineer"
},
@@ -1558,7 +1622,7 @@
"read every file in repo"
],
"category": "analysis",
- "user_invocable": true,
+ "user_invocable": false,
"version": "1.0.0",
"pairs_with": [
"explore-pipeline"
@@ -1609,7 +1673,7 @@
"poke holes in this"
],
"category": "analysis",
- "user_invocable": true,
+ "user_invocable": false,
"version": "2.0.0"
},
"routing-table-updater": {
@@ -1635,7 +1699,7 @@
"sapcc standards check"
],
"category": "language",
- "user_invocable": true,
+ "user_invocable": false,
"version": "2.0.0",
"pairs_with": [
"golang-general-engineer",
@@ -1656,7 +1720,7 @@
"review sapcc standards"
],
"category": "language",
- "user_invocable": true,
+ "user_invocable": false,
"version": "1.0.0",
"pairs_with": [
"golang-general-engineer",
@@ -1666,6 +1730,29 @@
"agent": "golang-general-engineer",
"model": "opus"
},
+ "security-threat-model": {
+ "file": "skills/security-threat-model/SKILL.md",
+ "description": "Phase-gated security threat model skill.",
+ "triggers": [
+ "threat model",
+ "security audit",
+ "supply chain scan",
+ "deny list",
+ "learning db sanitize",
+ "security posture",
+ "injection scan",
+ "surface scan",
+ "audit hooks",
+ "audit skills"
+ ],
+ "category": "security",
+ "user_invocable": false,
+ "version": "1.0.0",
+ "pairs_with": [
+ "python-general-engineer"
+ ],
+ "model": "opus"
+ },
"seo-optimizer": {
"file": "skills/seo-optimizer/SKILL.md",
"description": "Analyze and optimize blog post SEO: keywords, titles, meta descriptions, headers, and internal linking.",
@@ -1716,6 +1803,27 @@
"user_invocable": false,
"version": "2.0.0"
},
+ "skill-creator": {
+ "file": "skills/skill-creator/SKILL.md",
+ "description": "Create new skills and iteratively improve them through eval-driven validation.",
+ "triggers": [
+ "create skill",
+ "new skill",
+ "skill template",
+ "skill design",
+ "test skill",
+ "improve skill",
+ "optimize description",
+ "skill eval"
+ ],
+ "category": "meta",
+ "user_invocable": false,
+ "version": "2.0.0",
+ "pairs_with": [
+ "agent-evaluation",
+ "verification-before-completion"
+ ]
+ },
"skill-eval": {
"file": "skills/skill-eval/SKILL.md",
"description": "Evaluate and improve skills through measured testing.",
@@ -1729,13 +1837,13 @@
"skill quality"
],
"category": "meta",
- "user_invocable": true,
+ "user_invocable": false,
"version": "1.0.0",
"pairs_with": [
"agent-evaluation",
"verification-before-completion"
],
- "agent": "skill-creator-engineer"
+ "agent": "skill-creator"
},
"socratic-debugging": {
"file": "skills/socratic-debugging/SKILL.md",
@@ -1788,6 +1896,30 @@
"user_invocable": false,
"version": "2.0.0"
},
+ "swift-concurrency": {
+ "file": "skills/swift-concurrency/SKILL.md",
+ "description": "Swift structured concurrency with async/await, Actor, Task, and Sendable patterns",
+ "triggers": [
+ "swift-concurrency",
+ "swift",
+ "concurrency"
+ ],
+ "user_invocable": false,
+ "version": "1.0.0",
+ "agent": "general-purpose"
+ },
+ "swift-testing": {
+ "file": "skills/swift-testing/SKILL.md",
+ "description": "Swift testing patterns with XCTest, Swift Testing framework, and async test patterns",
+ "triggers": [
+ "swift-testing",
+ "swift",
+ "testing"
+ ],
+ "user_invocable": false,
+ "version": "1.0.0",
+ "agent": "general-purpose"
+ },
"systematic-code-review": {
"file": "skills/systematic-code-review/SKILL.md",
"description": "4-phase code review methodology: UNDERSTAND changes, VERIFY claims against code, ASSESS security/performance/architecture risks, DOCUMENT findings with severity classification.",
@@ -1984,12 +2116,12 @@
"assemble clips",
"video editing"
],
- "category": "media",
- "user_invocable": true,
+ "user_invocable": false,
"version": "1.0.0",
"pairs_with": [
"typescript-frontend-engineer"
],
+ "agent": "python-general-engineer",
"model": "sonnet"
},
"vitest-runner": {
@@ -2037,7 +2169,7 @@
"strict verification"
],
"category": "process",
- "user_invocable": true,
+ "user_invocable": false,
"version": "2.0.0"
},
"wordpress-live-validation": {
@@ -2099,7 +2231,7 @@
},
"x-api": {
"file": "skills/x-api/SKILL.md",
- "description": "Post tweets, build threads, upload media, and read timelines via the X API with OAuth 1.0a/2.0 and a mandatory confirm gate before any write.",
+ "description": "Post tweets, build threads, upload media, and read timelines via the X API.",
"triggers": [
"post to X",
"post tweet",
@@ -2116,12 +2248,14 @@
"publish to twitter"
],
"category": "content-publishing",
- "user_invocable": true,
+ "user_invocable": false,
"version": "1.0.0",
"pairs_with": [
"content-engine",
"crosspost"
- ]
+ ],
+ "agent": "python-general-engineer",
+ "model": "sonnet"
}
}
}
diff --git a/skills/agent-evaluation/SKILL.md b/skills/agent-evaluation/SKILL.md
index 761fde1..2534f9e 100644
--- a/skills/agent-evaluation/SKILL.md
+++ b/skills/agent-evaluation/SKILL.md
@@ -60,7 +60,7 @@ This skill operates as an operator for agent/skill quality assurance, configurin
- Batch-evaluate entire collections with summary statistics
## What This Skill CANNOT Do
-- Modify or fix agents/skills (use skill-creator-engineer instead)
+- Modify or fix agents/skills (use skill-creator instead)
- Evaluate external repositories or non-agent/skill files
- Replace human judgment on content accuracy or domain correctness
- Skip rubric categories — all must be scored
diff --git a/skills/do/references/routing-tables.md b/skills/do/references/routing-tables.md
index 83866ae..9063415 100644
--- a/skills/do/references/routing-tables.md
+++ b/skills/do/references/routing-tables.md
@@ -33,7 +33,7 @@ Route to these agents based on the user's task domain. Each entry describes what
| **project-coordinator-engineer** | User needs multi-agent coordination for a large project: spawning parallel agents, tracking cross-cutting tasks, or orchestrating a multi-phase effort. |
| **pipeline-orchestrator-engineer** | User wants to create a new pipeline, scaffold a new structured workflow, or compose pipeline phases. |
| **hook-development-engineer** | User wants to create or modify Python hooks for Claude Code's event-driven system (SessionStart, PostToolUse, etc.). |
-| **skill-creator-engineer** | User wants to create or improve a Claude Code skill, workflow automation, or agent configuration. |
+| **skill-creator** | User wants to create or improve a Claude Code skill, workflow automation, or agent configuration. |
| **system-upgrade-engineer** | User wants to upgrade the agent/skill/hook ecosystem after a Claude model update or system-wide change. |
| **technical-documentation-engineer** | User needs technical documentation created, maintained, or validated — API docs, READMEs, architecture guides. |
| **technical-journalist-writer** | User needs professional technical writing in a journalism style — articles, posts, or content with a specific authored voice. |
@@ -46,7 +46,7 @@ Route to these agents based on the user's task domain. Each entry describes what
| **github-profile-rules-engineer** | User wants to extract coding conventions, programming rules, or style guidelines from a GitHub profile's repositories. |
| **react-portfolio-engineer** | User is building a React portfolio or gallery website, typically for creative professionals. |
| **nextjs-ecommerce-engineer** | User is building an e-commerce site with Next.js: product pages, cart, checkout flows. |
-| **toolkit-governance-engineer** | User wants to maintain or modify the toolkit's own internal structure: editing skill/agent files, updating routing tables, managing ADRs, regenerating INDEX.json, or enforcing frontmatter compliance. NOT: creating brand-new agents (use skill-creator-engineer), writing application code (domain agents), or reviewing external PRs (reviewer agents). |
+| **toolkit-governance-engineer** | User wants to maintain or modify the toolkit's own internal structure: editing skill/agent files, updating routing tables, managing ADRs, regenerating INDEX.json, or enforcing frontmatter compliance. NOT: creating brand-new agents (use skill-creator), writing application code (domain agents), or reviewing external PRs (reviewer agents). |
---
@@ -229,10 +229,10 @@ All pipelines live in the `pipelines/` directory (synced to `~/.claude/skills/`
|----------|--------------------|--------|
| **pipeline-scaffolder** (pipeline-orchestrator-engineer) | User wants to create a new pipeline, scaffold a new structured workflow from a spec. | LOAD → SCAFFOLD → INTEGRATE → REPORT |
| **system-upgrade** (system-upgrade-engineer) | User wants to upgrade the Claude Code toolkit after a model update, apply system-wide changes, or roll out agent improvements. NOT: upgrading a specific library dependency in user code. | CHANGELOG → AUDIT → PLAN → IMPLEMENT → VALIDATE → DEPLOY |
-| **skill-creation-pipeline** (skill-creator-engineer) | User wants to create a new skill with formal quality gates, phase structure, and integration. | DISCOVER → DESIGN → SCAFFOLD → VALIDATE → INTEGRATE |
+| **skill-creation-pipeline** (skill-creator) | User wants to create a new skill with formal quality gates, phase structure, and integration. | DISCOVER → DESIGN → SCAFFOLD → VALIDATE → INTEGRATE |
| **hook-development-pipeline** (hook-development-engineer) | User wants to create a new hook with formal spec, performance testing, and registration. | SPEC → IMPLEMENT → TEST → REGISTER → DOCUMENT |
| **research-pipeline** (research-coordinator-engineer) | User wants formal research with saved artifacts, multiple sources, and a synthesized deliverable. NOT: a quick lookup or single-source check. | SCOPE → GATHER → SYNTHESIZE → VALIDATE → DELIVER |
-| **agent-upgrade** (skill-creator-engineer) | User wants to audit and improve a specific agent to bring it up to current template standards. | AUDIT → DIFF → PLAN → IMPLEMENT → RE-EVALUATE |
+| **agent-upgrade** (skill-creator) | User wants to audit and improve a specific agent to bring it up to current template standards. | AUDIT → DIFF → PLAN → IMPLEMENT → RE-EVALUATE |
| **research-to-article** | User wants to research a topic and turn the findings into a written article. | RESEARCH → COMPILE → GROUND → GENERATE → VALIDATE → REFINE → OUTPUT |
| **doc-pipeline** | User wants to generate documentation for a codebase, create a README, or write technical docs from scratch. | RESEARCH → OUTLINE → GENERATE → VERIFY → OUTPUT |
| **pr-pipeline** | User wants the full structured PR workflow with review gates. | CLASSIFY → STAGE → REVIEW → COMMIT → PUSH → CREATE → VERIFY → CLEANUP |
@@ -376,10 +376,10 @@ Invoked via the roast skill or directly:
| "research then write article" | research-to-article pipeline | Research-backed content creation |
| "create a pipeline for X" | pipeline-orchestrator-engineer + pipeline-scaffolder | Pipeline creation |
| "upgrade system for new Claude version" | system-upgrade-engineer + system-upgrade | System-wide upgrade |
-| "create skill with quality gates" | skill-creator-engineer + skill-creation-pipeline | Formal skill creation |
+| "create skill with quality gates" | skill-creator + skill-creation-pipeline | Formal skill creation |
| "create hook (formal, with perf test)" | hook-development-engineer + hook-development-pipeline | Formal hook creation |
| "research with saved artifacts" | research-coordinator-engineer + research-pipeline | Formal research pipeline |
-| "upgrade this specific agent" | skill-creator-engineer + agent-upgrade | Single agent improvement |
+| "upgrade this specific agent" | skill-creator + agent-upgrade | Single agent improvement |
| "create a 3D scene" | typescript-frontend-engineer + threejs-builder | Frontend domain, 3D task |
| "generate image with Python" | python-general-engineer + gemini-image-generator | Python domain, image generation |
| "extract coding rules from github user X" | github-profile-rules-engineer + github-profile-rules | Profile analysis |
diff --git a/skills/routing-table-updater/SKILL.md b/skills/routing-table-updater/SKILL.md
index f55973e..0a51a76 100644
--- a/skills/routing-table-updater/SKILL.md
+++ b/skills/routing-table-updater/SKILL.md
@@ -3,7 +3,7 @@ name: routing-table-updater
description: |
Maintain /do routing tables and command references when skills or agents
are added, modified, or removed. Use when skill/agent metadata changes,
- after skill-creator-engineer or agent-creator-engineer runs, or when
+ after skill-creator or agent-creator-engineer runs, or when
routing tables need synchronization. Use for "update routes", "sync
routing", "routing table", or "refresh /do". Do NOT use for creating
new skills/agents, modifying skill logic, or manual /do table edits.
@@ -262,7 +262,7 @@ If gate fails:
### Example 1: New Skill Created
-User creates `skills/api-integration-helper/SKILL.md` via skill-creator-engineer:
+User creates `skills/api-integration-helper/SKILL.md` via skill-creator:
```yaml
---
@@ -375,7 +375,7 @@ The scaffolder provides a component list (from the Pipeline Spec):
| Scan | All skills/* and agents/* | Only listed components |
| Conflict check | Against existing entries | Against existing AND within batch |
| OUTPUT | One entry at a time | N entries in one pass |
-| Invoked by | skill-creator-engineer, agent-creator-engineer | pipeline-scaffolder Phase 4 |
+| Invoked by | skill-creator, agent-creator-engineer | pipeline-scaffolder Phase 4 |
---
@@ -383,7 +383,7 @@ The scaffolder provides a component list (from the Pipeline Spec):
This skill is typically invoked after other creation skills complete:
-- **After skill-creator-engineer**: New skill created, routing tables need updated entry
+- **After skill-creator**: New skill created, routing tables need updated entry
- **After agent-creator-engineer**: New agent created, domain routing needs expansion
- **After skill/agent modification**: Description or trigger changes require routing refresh
- **During repository maintenance**: Periodic sync to catch manual drift
diff --git a/skills/shared-patterns/pipeline-architecture.md b/skills/shared-patterns/pipeline-architecture.md
index f182873..2f71830 100644
--- a/skills/shared-patterns/pipeline-architecture.md
+++ b/skills/shared-patterns/pipeline-architecture.md
@@ -215,7 +215,7 @@ Define Requirements
Add to Routing
```
-**Skill**: `agent-creator-engineer` or `skill-creator-engineer`
+**Skill**: `agent-creator-engineer` or `skill-creator`
---
diff --git a/skills/skill-creator/SKILL.md b/skills/skill-creator/SKILL.md
new file mode 100644
index 0000000..7bab347
--- /dev/null
+++ b/skills/skill-creator/SKILL.md
@@ -0,0 +1,390 @@
+---
+name: skill-creator
+description: |
+ Create new skills and iteratively improve them through eval-driven validation.
+ Draft a skill, test it against real prompts, review the outputs, improve based
+ on measured results, repeat. Use when creating new skills, improving existing
+ skills, testing skill quality, or optimizing descriptions for triggering accuracy.
+ Use for "create skill", "new skill", "test skill", "improve skill", "optimize
+ description", "skill eval", "turn this into a skill". Do NOT use for agent
+ creation (use agent-creator-engineer) or hook development (use
+ hook-development-engineer).
+version: 2.0.0
+routing:
+ triggers:
+ - create skill
+ - new skill
+ - skill template
+ - skill design
+ - test skill
+ - improve skill
+ - optimize description
+ - skill eval
+ pairs_with:
+ - agent-evaluation
+ - verification-before-completion
+ complexity: Complex
+ category: meta
+allowed-tools:
+ - Read
+ - Edit
+ - Write
+ - Bash
+ - Glob
+ - Grep
+ - Agent
+---
+
+# Skill Creator
+
+Create skills and iteratively improve them through measurement.
+
+The process:
+
+- Decide what the skill should do and how it should work
+- Write a draft of the skill
+- Create test prompts and run claude-with-the-skill on them
+- Evaluate the results — both with agent reviewers and optionally human review
+- Improve the skill based on what the evaluation reveals
+- Repeat until the skill demonstrably helps
+
+Figure out where the user is in this process and help them progress. If they say
+"I want to make a skill for X", help narrow scope, write a draft, write test cases,
+and run the eval loop. If they already have a draft, go straight to testing.
+
+---
+
+## Creating a skill
+
+### Capture intent
+
+Start by understanding what the user wants. The current conversation might already
+contain a workflow worth capturing ("turn this into a skill"). If so, extract:
+
+1. What should this skill enable Claude to do?
+2. When should this skill trigger? (what user phrases, what contexts)
+3. What is the expected output?
+4. Are the outputs objectively verifiable (code, data transforms, structured files)
+ or subjective (writing quality, design aesthetics)? Objectively verifiable outputs
+ benefit from test cases. Subjective outputs are better evaluated by human review.
+
+### Research
+
+Check for existing skills that overlap — run `grep -r "trigger-keyword" skills/*/SKILL.md`
+to avoid duplicating what already exists. If a similar skill exists, offer to improve
+it rather than create a new one.
+
+Read the repository CLAUDE.md before writing anything. Project conventions override
+default patterns.
+
+### Write the SKILL.md
+
+Based on the user interview, create the skill directory and write the SKILL.md.
+
+**Skill structure:**
+
+```
+skill-name/
+├── SKILL.md # Required — the workflow
+├── scripts/ # Deterministic CLI tools the skill invokes
+├── agents/ # Subagent prompts used only by this skill
+├── references/ # Deep context loaded on demand
+└── assets/ # Templates, viewers, static files
+```
+
+**Frontmatter** — name, description, routing metadata:
+
+```yaml
+---
+name: skill-slug-name
+description: |
+ [What it does — 1-2 sentences]. Use when [trigger conditions].
+ Use for "[phrase 1]", "[phrase 2]". Do NOT use for [exclusions].
+version: 1.0.0
+routing:
+ triggers:
+ - keyword1
+ - keyword2
+ pairs_with:
+ - related-skill
+ complexity: Simple | Medium | Complex
+ category: language | infrastructure | review | meta | content
+allowed-tools:
+ - Read
+ - Write
+ - Bash
+---
+```
+
+The description is the primary triggering mechanism. Claude tends to undertrigger
+skills — not activating them when they would help. Combat this by being explicit
+about trigger contexts. Include "Use for" with concrete phrases users would say.
+
+**Body** — workflow first, then context:
+
+1. Brief overview (2-3 sentences: what this does and how)
+2. Instructions / workflow phases (the actual methodology)
+3. Reference material (commands, guides, schemas)
+4. Error handling (cause/solution pairs for common failures)
+5. References to bundled files
+
+Constraints belong inline within the workflow step where they apply, not in a
+separate section. If a constraint matters during Phase 2, put it in Phase 2 —
+not in a preamble the model reads 200 lines before it reaches Phase 2.
+
+Explain the reasoning behind constraints rather than issuing bare imperatives.
+"Run with `-race` because race conditions are silent until production" is more
+effective than "ALWAYS run with -race" because the model can generalize the
+reasoning to situations the skill author didn't anticipate.
+
+**Progressive disclosure** — keep SKILL.md navigable:
+- Summary in frontmatter, workflow in body, deep reference in `references/`
+- If SKILL.md exceeds ~500 lines, move detailed catalogs to reference files
+- Reference files clearly linked from SKILL.md with guidance on when to read them
+
+### Bundled scripts
+
+Extract deterministic, repeatable operations into `scripts/*.py` CLI tools with
+argparse interfaces. Scripts save tokens (the model doesn't reinvent the wheel
+each invocation), ensure consistency across runs, and can be tested independently.
+
+Pattern: `scripts/` for deterministic ops, SKILL.md for LLM-orchestrated workflow.
+
+### Bundled agents
+
+For skills that spawn subagents with specialized roles, bundle agent prompts in
+`agents/`. These are not registered in the routing system — they are internal to
+the skill's workflow.
+
+| Scenario | Approach |
+|----------|----------|
+| Agent used only by this skill | Bundle in `agents/` |
+| Agent shared across skills | Keep in repo `agents/` directory |
+| Agent needs routing metadata | Keep in repo `agents/` directory |
+
+---
+
+## Testing the skill
+
+This is the core of the eval loop. Do not stop after writing — test the skill
+against real prompts and measure whether it actually helps.
+
+### Create test prompts
+
+Write 2-3 realistic test prompts — the kind of thing a real user would say. Rich,
+detailed, specific. Not abstract one-liners.
+
+Bad: `"Format this data"`
+Good: `"I have a CSV in ~/downloads/q4-sales.csv with revenue in column C and costs
+in column D. Add a profit margin percentage column and highlight rows where margin
+is below 10%."`
+
+Share prompts with the user for review before running them.
+
+Save test cases to `evals/evals.json` in the workspace (not in the skill directory —
+eval data is ephemeral):
+
+```json
+{
+ "skill_name": "example-skill",
+ "evals": [
+ {
+ "id": 1,
+ "name": "descriptive-name",
+ "prompt": "The realistic user prompt",
+ "assertions": []
+ }
+ ]
+}
+```
+
+### Run test prompts
+
+For each test case, spawn two subagents in the same turn — one with the skill
+loaded, one without (baseline). Launch everything at once so it finishes together.
+
+**With-skill run:** Tell the subagent to read the skill's SKILL.md first, then
+execute the task. Save outputs to the workspace.
+
+**Baseline run:** Same prompt, no skill loaded. Save to a separate directory.
+
+Organize results by iteration:
+
+```
+skill-workspace/
+├── evals/evals.json
+├── iteration-1/
+│ ├── eval-descriptive-name/
+│ │ ├── with_skill/outputs/
+│ │ ├── without_skill/outputs/
+│ │ └── grading.json
+│ └── benchmark.json
+└── iteration-2/
+ └── ...
+```
+
+### Evaluate results
+
+Evaluation has three tiers, applied in order:
+
+**Tier 1: Deterministic checks** — run automatically where applicable:
+- Does the code compile? (`go build`, `tsc --noEmit`, `python -m py_compile`)
+- Do tests pass? (`go test -race`, `pytest`, `vitest`)
+- Does the linter pass? (`go vet`, `ruff`, `biome`)
+
+**Tier 2: Agent blind review** — dispatch using `agents/comparator.md`:
+- Comparator receives both outputs labeled "Output 1" / "Output 2"
+- It does NOT know which is the skill version
+- Scores on relevant dimensions, picks a winner with reasoning
+- Save results to `blind_comparison.json`
+
+**Tier 3: Human review (optional)** — generate the comparison viewer:
+```bash
+python3 scripts/eval_compare.py path/to/workspace
+open path/to/workspace/compare_report.html
+```
+
+The viewer shows outputs side by side with blind labels, agent review panels,
+deterministic check results, winner picker, feedback textarea, and a
+skip-to-results option. Human reviews are optional — agent reviews are sufficient
+for iteration.
+
+### Draft assertions
+
+While test runs are in progress, draft quantitative assertions for objective
+criteria. Good assertions are discriminating — they fail when the skill doesn't
+help and pass when it does. Non-discriminating assertions ("file exists") provide
+false confidence.
+
+Run the grader (`agents/grader.md`) to evaluate assertions against outputs:
+- PASS requires genuine substance, not surface compliance
+- The grader also critiques the assertions themselves — flagging ones that would
+ pass regardless of skill quality
+
+Aggregate results with `scripts/aggregate_benchmark.py` to get pass rates,
+timing, and token usage with mean/stddev across runs.
+
+---
+
+## Improving the skill
+
+This is the iterative heart of the process.
+
+**Generalize from feedback.** Skills will be used across many prompts, not just
+test cases. If a fix only helps the test case but wouldn't generalize, it's
+overfitting. Try different approaches rather than fiddly adjustments.
+
+**Keep instructions lean.** Read the execution transcripts, not just the final
+outputs. If the skill causes the model to waste time on unproductive work, remove
+those instructions. Instructions that don't pull their weight hurt more than they
+help — they consume attention budget without producing value.
+
+**Explain the reasoning.** Motivation-based instructions generalize better than
+rigid imperatives. "Prefer table-driven tests because they make adding cases
+trivial and the input-output relationship explicit" works better than "MUST use
+table-driven tests" because the model understands when the pattern applies and
+when it doesn't.
+
+**Extract repeated work.** Read the transcripts from test runs. If all subagents
+independently wrote similar helper scripts or took the same multi-step approach,
+bundle that script in `scripts/`. One shared implementation beats N independent
+reinventions.
+
+### The iteration loop
+
+1. Apply improvements to the skill
+2. Rerun all test cases into `iteration-N/`, including baselines
+3. Generate the comparison viewer with `--previous-workspace` pointing at the
+ prior iteration
+4. Review — agent or human
+5. Repeat until results plateau or the user is satisfied
+
+Stop iterating when:
+- Feedback is empty (outputs look good)
+- Pass rates aren't improving between iterations
+- The user says they're satisfied
+
+---
+
+## Description optimization
+
+The description field determines whether Claude activates the skill. After the
+skill is working well, optimize the description for triggering accuracy.
+
+Generate 20 eval queries — 10 that should trigger, 10 that should not. The
+should-not queries are the most important: they should be near-misses from
+adjacent domains, not obviously irrelevant queries.
+
+Run the optimization loop:
+```bash
+python3 scripts/optimize_description.py \
+ --skill-path path/to/skill \
+ --eval-set evals/trigger-eval.json \
+ --max-iterations 5
+```
+
+This splits queries 60/40 train/test, evaluates the current description (3 runs
+per query for reliability), proposes improvements based on failures, and selects
+the best description by test-set score to avoid overfitting.
+
+---
+
+## Bundled agents
+
+The `agents/` directory contains prompts for specialized subagents used by this
+skill. Read them when you need to spawn the relevant subagent.
+
+- `agents/grader.md` — Evaluate assertions against outputs with cited evidence
+- `agents/comparator.md` — Blind A/B comparison of two outputs
+- `agents/analyzer.md` — Post-hoc analysis of why one version beat another
+
+---
+
+## Bundled scripts
+
+- `scripts/run_eval.py` — Execute a skill against a test prompt via `claude -p`
+- `scripts/aggregate_benchmark.py` — Compute pass rate statistics across runs
+- `scripts/optimize_description.py` — Train/test description optimization loop
+- `scripts/package_results.py` — Consolidate iteration artifacts into a report
+- `scripts/eval_compare.py` — Generate blind comparison HTML viewer
+
+---
+
+## Reference files
+
+- `references/artifact-schemas.md` — JSON schemas for eval artifacts (evals.json,
+ grading.json, benchmark.json, comparison.json, timing.json, metrics.json)
+- `references/skill-template.md` — Complete SKILL.md template with all sections
+- `references/complexity-tiers.md` — Skill examples by complexity tier
+- `references/workflow-patterns.md` — Reusable phase structures and gate patterns
+- `references/error-catalog.md` — Common skill creation errors with solutions
+
+---
+
+## Error handling
+
+### Skill doesn't trigger when it should
+Cause: Description is too vague or missing trigger phrases
+Solution: Add explicit "Use for" phrases matching what users actually say.
+Test with `scripts/optimize_description.py`.
+
+### Test run produces empty output
+Cause: The `claude -p` subprocess didn't load the skill, or the skill path is wrong
+Solution: Verify the skill directory contains SKILL.md (exact case). Check
+the `--skill-path` argument points to the directory, not the file.
+
+### Grading results show all-pass regardless of skill
+Cause: Assertions are non-discriminating (e.g., "file exists")
+Solution: Write assertions that test behavior, not structure. The grader's
+eval critique section flags these — read it.
+
+### Iteration loop doesn't converge
+Cause: Changes are overfitting to test cases rather than improving the skill
+Solution: Expand the test set with more diverse prompts. Focus improvements
+on understanding WHY outputs differ, not on patching specific failures.
+
+### Description optimization overfits to train set
+Cause: Test set is too small or train/test queries are too similar
+Solution: Ensure should-trigger and should-not-trigger queries are realistic
+near-misses, not obviously different. The 60/40 split guards against this,
+but only if the queries are well-designed.
diff --git a/skills/skill-creator/agents/analyzer.md b/skills/skill-creator/agents/analyzer.md
new file mode 100644
index 0000000..e4665e2
--- /dev/null
+++ b/skills/skill-creator/agents/analyzer.md
@@ -0,0 +1,109 @@
+# Analyzer Agent
+
+You are a post-hoc analysis agent for eval pipelines. You operate after unblinding —
+you know which output was produced with the skill and which without. Your role is to
+produce actionable improvement suggestions based on the full picture of evidence.
+
+## Modes
+
+You operate in one of two modes, specified in the input:
+
+### Mode: comparison
+
+**When to use**: After a single eval's blind comparison has been completed and unblinded.
+
+**Inputs**:
+- `comparison_json`: Path to comparison.json from the comparator agent
+- `skill_a_path` or `skill_b_path`: Which label (A or B) corresponds to with_skill
+- `with_skill_transcript`: Path to with_skill/transcript.md
+- `without_skill_transcript`: Path to without_skill/transcript.md
+- `with_skill_outputs_dir`: Path to with_skill/outputs/
+- `without_skill_outputs_dir`: Path to without_skill/outputs/
+
+**Analysis tasks**:
+1. Identify WHY the winner won (specific criterion advantages)
+2. Identify WHERE the loser can improve (specific, actionable suggestions)
+3. If the skill won: identify what instructions produced the winning behavior so they
+ can be strengthened
+4. If the skill lost: identify which instructions caused harm or were simply ineffective
+5. Check if the skill caused unnecessary work in the transcript (unproductive loops,
+ redundant steps, ignored instructions)
+
+### Mode: benchmark
+
+**When to use**: After an iteration's full benchmark has been computed.
+
+**Inputs**:
+- `benchmark_json`: Path to iteration's benchmark.json
+- `all_grading_jsons`: List of paths to all grading.json files in the iteration
+- `all_comparison_jsons`: List of paths to all comparison.json files in the iteration
+
+**Analysis tasks**:
+1. Identify patterns across all evals (which assertion types consistently fail?)
+2. Flag non-discriminating assertions that appeared in multiple evals
+3. Identify high-variance evals (comparator score spreads, grading inconsistencies)
+4. Surface metric outliers (evals with unusually high token cost or duration)
+5. Produce 3-5 prioritized improvement suggestions for the skill
+
+## Output
+
+Produce a JSON file named `analysis.json` with exactly this structure:
+
+```json
+{
+ "mode": "comparison | benchmark",
+ "timestamp": "ISO 8601 timestamp",
+ "skill_won": "boolean — true if with_skill won (comparison mode) or pass_rate delta > 0 (benchmark mode)",
+ "findings": [
+ {
+ "category": "winner_factors | loser_improvements | instruction_analysis | transcript_waste | assertion_quality | metric_outliers | variance",
+ "priority": "high | medium | low",
+ "finding": "specific observation with cited evidence",
+ "actionable_suggestion": "concrete change to make to the skill or eval"
+ }
+ ],
+ "improvements_for_skill": [
+ {
+ "target": "which section/instruction to change",
+ "current_behavior": "what the skill currently does",
+ "desired_behavior": "what it should do instead",
+ "rationale": "why this change would improve results",
+ "generalization_risk": "low | medium | high — risk of overfitting this change to test cases"
+ }
+ ],
+ "improvements_for_evals": [
+ {
+ "assertion": "the assertion to improve or replace",
+ "problem": "why this assertion is weak or non-discriminating",
+ "replacement": "suggested replacement assertion text"
+ }
+ ],
+ "benchmark_summary": {
+ "with_skill_pass_rate_mean": "float — benchmark mode only",
+ "without_skill_pass_rate_mean": "float — benchmark mode only",
+ "delta": "float — with_skill minus without_skill",
+ "comparator_win_rate": "float — fraction of evals where skill won",
+ "top_failure_categories": ["list of assertion categories that frequently fail"]
+ },
+ "analyzer_notes": "optional string — observations that do not fit the structured fields"
+}
+```
+
+The schema is a contract. Field names, types, and nesting must match exactly. The
+`package_results.py` script reads `findings`, `improvements_for_skill`, and
+`benchmark_summary` by field name.
+
+## Behavior Rules
+
+- Every finding must cite specific evidence. "The skill seems to help" is not a finding.
+ "The skill produced a YAML frontmatter with 7 required fields; without-skill produced
+ 3" is a finding.
+- `generalization_risk` is mandatory for every improvement_for_skill entry. High risk
+ means the change would only help on the specific test case and would likely confuse
+ the model on unseen prompts.
+- In benchmark mode, if `delta` is near zero (within 0.05), investigate whether the
+ assertions are non-discriminating before concluding the skill is ineffective.
+- Prioritize `improvements_for_skill` by expected impact. High priority means the change
+ would plausibly improve pass rate by more than 10 percentage points.
+- Do not suggest adding more instructions as a default. If the skill is not helping,
+ removing instructions (reducing noise) is often more effective than adding them.
diff --git a/skills/skill-creator/agents/comparator.md b/skills/skill-creator/agents/comparator.md
new file mode 100644
index 0000000..9ff7361
--- /dev/null
+++ b/skills/skill-creator/agents/comparator.md
@@ -0,0 +1,118 @@
+# Comparator Agent
+
+You are a blind A/B comparison agent for eval pipelines. You receive two sets of execution
+outputs labeled A and B. You do not know which skill produced which output. Your role is
+to produce a scored comparison without knowing the answer — this prevents confirmation bias
+from affecting the verdict.
+
+## Inputs
+
+You will receive:
+- `output_a_dir`: Path to the first execution's outputs directory
+- `output_b_dir`: Path to the second execution's outputs directory
+- `transcript_a`: Path to the first execution's transcript.md
+- `transcript_b`: Path to the second execution's transcript.md
+- `assertions` (optional): Assertion list from evals.json, as a secondary signal
+
+## Process
+
+### Step 1: Read all artifacts without bias
+
+Read all output files and transcripts for both A and B. Do not attempt to determine which
+is "with skill" and which is "without skill." Treat them as two independent submissions
+competing on quality.
+
+### Step 2: Generate a rubric
+
+Before scoring, write a rubric with 4-6 evaluation criteria. Criteria must be grounded in
+the actual content — do not use generic criteria like "quality" without defining what
+quality means for this specific type of output.
+
+Example criteria for a SKILL.md creation eval:
+- Frontmatter completeness (required fields present and populated)
+- Phase structure quality (phases have clear inputs, outputs, and gate conditions)
+- Instruction specificity (steps are actionable, not aspirational)
+- Error handling coverage (top errors covered with cause/solution pairs)
+- Anti-rationalization presence and quality
+
+### Step 3: Score both outputs
+
+For each criterion, assign a score from 1 to 5:
+- 5: Excellent — exceeds expectations with specific, substantive content
+- 4: Good — meets expectations consistently
+- 3: Adequate — meets minimum requirements with some gaps
+- 2: Weak — below expectations, significant gaps
+- 1: Poor — fails to meet basic requirements
+
+Score A and B independently for each criterion. Do not adjust one score based on the
+other — each score must stand alone against the rubric.
+
+### Step 4: Check assertions (secondary signal)
+
+If assertions were provided, evaluate each output against them. This is a secondary
+signal to the rubric scores, not a replacement. A high assertion pass rate with low
+rubric scores indicates weak assertions.
+
+### Step 5: Determine winner
+
+Compute total rubric scores for A and B. The higher total is the winner. If the totals
+are within 2 points of each other, classify as "tie." Report each total on a 1-10 scale
+(the weighted sum of criteria scores, normalized so 10 means a perfect 5 on every criterion).
+
+## Output
+
+Produce a JSON file named `comparison.json` with exactly this structure:
+
+```json
+{
+ "eval_id": "string — the eval name/identifier",
+ "timestamp": "ISO 8601 timestamp",
+ "rubric": [
+ {
+ "criterion": "criterion name",
+ "description": "what this criterion measures",
+ "weight": "float — relative importance, all weights sum to 1.0"
+ }
+ ],
+ "scores": {
+ "A": {
+ "criteria_scores": [
+ {
+ "criterion": "criterion name",
+ "score": "integer 1-5",
+ "rationale": "specific evidence for this score"
+ }
+ ],
+ "total_score": "float — weighted sum of criteria scores normalized to 1-10",
+ "assertion_pass_rate": "float 0.0–1.0 — if assertions provided, else null"
+ },
+ "B": {
+ "criteria_scores": [],
+ "total_score": "float",
+ "assertion_pass_rate": "float or null"
+ }
+ },
+ "winner": "A | B | tie",
+ "winner_margin": "float — difference in total scores",
+ "reasoning": "string — 2-4 sentences explaining the decision, referencing specific criterion differences",
+ "confidence": "high | medium | low",
+ "comparator_notes": "optional — observations about the comparison that don't fit the rubric"
+}
+```
+
+The schema is a contract. Field names, types, and nesting must match exactly. The
+`analyzer.md` agent reads `winner`, `total_score`, and `reasoning` by field name.
+
+## Behavior Rules
+
+- Never attempt to determine which output is "with skill" or "without skill." You will
+ be unblinded by the analyzer agent after this step.
+- Never use "quality" or "better" as criterion names without defining what they mean for
+ this specific content type.
+- Each `rationale` must cite specific content from the output, not general impressions.
+ "A's error handling section covers 5 specific errors with cause/solution pairs" is
+ acceptable. "A's error handling seems more thorough" is not.
+- If both outputs are identical or near-identical, set `winner` to "tie" and note this
+ in `comparator_notes`.
+- If one output is clearly empty or failed, score all criteria 1 and set winner to
+ the non-empty output. Note the failure in `comparator_notes`.
diff --git a/skills/skill-creator/agents/grader.md b/skills/skill-creator/agents/grader.md
new file mode 100644
index 0000000..9665022
--- /dev/null
+++ b/skills/skill-creator/agents/grader.md
@@ -0,0 +1,105 @@
+# Grader Agent
+
+You are a grading agent for eval pipelines. Your role is to evaluate whether execution
+outputs satisfy a set of assertions, producing cited evidence for every verdict.
+
+## Inputs
+
+You will receive:
+- `expectations`: A list of assertion strings from `evals.json`
+- `transcript_path`: Path to `transcript.md` from the execution run
+- `outputs_dir`: Path to the `outputs/` directory from the execution run
+
+## Process
+
+### Step 1: Read all artifacts
+
+Read `transcript.md` in full. Read all files in `outputs/`. Build a complete picture of
+what the execution produced before evaluating any assertion.
+
+### Step 2: Evaluate each assertion
+
+For each assertion in `expectations`:
+
+1. Determine whether it is PASS or FAIL based on the artifacts.
+2. Cite specific evidence: quote the relevant section of transcript.md or the relevant
+ content from an output file. Do not assert PASS without pointing to the specific
+ content that satisfies the assertion.
+3. If the assertion is ambiguous (could be interpreted in multiple ways), apply the
+ stricter interpretation and note the ambiguity.
+
+**Key rule**: PASS requires genuine substance, not surface compliance. Examples:
+- Correct filename with wrong content → FAIL
+- Correct structure with placeholder values → FAIL
+- Required field present but empty → FAIL
+- Required section heading present but no content under it → FAIL
+
+### Step 3: Extract and verify implicit claims
+
+After evaluating explicit assertions, scan the outputs for implicit claims — statements
+or artifacts that appear to assert something specific. Verify 2-3 of the most significant
+implicit claims. These are not scored against the pass rate but are included in the report
+for the analyzer agent.
+
+### Step 4: Critique eval quality
+
+Identify non-discriminating assertions: assertions that would PASS regardless of whether
+the skill was loaded. Flag these clearly because they inflate pass rates without measuring
+skill-specific behavior.
+
+Examples of non-discriminating assertions:
+- "Output is in English"
+- "No error messages present"
+- "Response is non-empty"
+- "File exists" (if any execution would produce a file)
+
+## Output
+
+Produce a JSON file named `grading.json` with exactly this structure:
+
+```json
+{
+ "eval_id": "string — the eval name/identifier",
+ "configuration": "with_skill | without_skill",
+ "timestamp": "ISO 8601 timestamp",
+ "assertions": [
+ {
+ "assertion": "the assertion text",
+ "verdict": "PASS | FAIL",
+ "evidence": "quoted excerpt or file reference supporting the verdict",
+ "confidence": "high | medium | low"
+ }
+ ],
+ "pass_count": "integer — number of PASS verdicts",
+ "fail_count": "integer — number of FAIL verdicts",
+ "pass_rate": "float 0.0–1.0",
+ "implicit_claims": [
+ {
+ "claim": "the implicit claim identified",
+ "verdict": "VERIFIED | UNVERIFIED | CONTRADICTED",
+ "evidence": "supporting or contradicting evidence"
+ }
+ ],
+ "eval_critique": {
+ "non_discriminating_assertions": ["list of assertion texts flagged as non-discriminating"],
+ "recommendation": "string — suggested assertion improvements"
+ },
+ "grader_notes": "optional string — any observations about unusual execution patterns"
+}
+```
+
+The schema is a contract. Field names, types, and nesting must match exactly. The
+`aggregate_benchmark.py` script parses `pass_rate`, `pass_count`, and `fail_count`
+by name.
+
+## Behavior Rules
+
+- Never infer PASS from ambiguous evidence. When in doubt, FAIL with a note explaining
+ what evidence would be needed for PASS.
+- Never skip an assertion. Every assertion in `expectations` must appear in `assertions`.
+- The `evidence` field must contain a direct quote or file path reference. "Looks correct"
+ is not evidence.
+- If `outputs/` is empty, all file-existence assertions are FAIL. Note this prominently
+ in `grader_notes`.
+- If `transcript.md` contains error messages from the execution, note them in
+ `grader_notes` even if no assertion directly tests for errors.
diff --git a/skills/skill-creator/assets/eval_viewer.html b/skills/skill-creator/assets/eval_viewer.html
new file mode 100644
index 0000000..636532b
--- /dev/null
+++ b/skills/skill-creator/assets/eval_viewer.html
@@ -0,0 +1,1189 @@
+
+
+
+
+
+Blind A/B Code Review
+
+
+
+
+
+
+
Blind A/B Code Review
+ Blind Mode
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/agents/skill-creator-engineer/references/anti-patterns.md b/skills/skill-creator/references/anti-patterns.md
similarity index 100%
rename from agents/skill-creator-engineer/references/anti-patterns.md
rename to skills/skill-creator/references/anti-patterns.md
diff --git a/skills/skill-creator/references/artifact-schemas.md b/skills/skill-creator/references/artifact-schemas.md
new file mode 100644
index 0000000..98eac8b
--- /dev/null
+++ b/skills/skill-creator/references/artifact-schemas.md
@@ -0,0 +1,302 @@
+# Artifact Schemas
+
+JSON contracts for all eval pipeline artifacts. Field names, types, and nesting are
+contracts between producers and consumers. Downstream scripts parse by field name —
+do not rename fields without updating all consumers.
+
+## Producer/Consumer Map
+
+| Schema | Producer | Consumer(s) |
+|--------|----------|-------------|
+| `evals.json` | Skill creator (human) | `run_eval.py`, grader agent |
+| `grading.json` | grader agent | `aggregate_benchmark.py`, analyzer agent |
+| `benchmark.json` | `aggregate_benchmark.py` | analyzer agent, `package_results.py` |
+| `comparison.json` | comparator agent | analyzer agent |
+| `analysis.json` | analyzer agent | `package_results.py`, skill creator |
+| `timing.json` | `run_eval.py` | `aggregate_benchmark.py` |
+| `metrics.json` | `run_eval.py` | grader agent |
+| `eval_metadata.json` | `run_eval.py` | grader agent, comparator agent |
+| `trigger-eval.json` | Skill creator (human) | `optimize_description.py` |
+
+---
+
+## evals.json
+
+Location: `skill-workspace/evals/evals.json`
+
+```json
+[
+ {
+ "eval_id": "string — unique identifier for this eval, used as directory name",
+ "prompt": "string — the test prompt text passed to claude -p",
+ "assertions": [
+ "string — one assertion per entry, binary and evidence-checkable"
+ ],
+ "metadata": {
+ "description": "string — optional human-readable description of what this eval tests",
+ "tags": ["optional array of tags for filtering"]
+ }
+ }
+]
+```
+
+**Rules**:
+- `eval_id` must be a valid directory name (kebab-case recommended)
+- Each assertion must be binary: it either passes or fails, with evidence
+- Assertions should test skill-specific behavior, not generic output properties
+
+---
+
+## grading.json
+
+Location: `skill-workspace/iteration-N/{eval-id}/grading.json`
+
+```json
+{
+ "eval_id": "string — matches the eval_id from evals.json",
+ "configuration": "string — 'with_skill' or 'without_skill'",
+ "timestamp": "string — ISO 8601 timestamp",
+ "assertions": [
+ {
+ "assertion": "string — the assertion text from evals.json",
+ "verdict": "string — 'PASS' or 'FAIL'",
+ "evidence": "string — quoted excerpt or file reference",
+ "confidence": "string — 'high', 'medium', or 'low'"
+ }
+ ],
+ "pass_count": "integer",
+ "fail_count": "integer",
+ "pass_rate": "float — range 0.0 to 1.0",
+ "implicit_claims": [
+ {
+ "claim": "string",
+ "verdict": "string — 'VERIFIED', 'UNVERIFIED', or 'CONTRADICTED'",
+ "evidence": "string"
+ }
+ ],
+ "eval_critique": {
+ "non_discriminating_assertions": ["array of assertion text strings"],
+ "recommendation": "string"
+ },
+ "grader_notes": "string or null"
+}
+```
+
+**Required fields for `aggregate_benchmark.py`**: `pass_rate`, `pass_count`, `fail_count`
+
+---
+
+## benchmark.json
+
+Location: `skill-workspace/iteration-N/benchmark.json`
+
+```json
+{
+ "skill_name": "string",
+ "workspace": "string — absolute path",
+ "timestamp": "string — ISO 8601",
+ "eval_count": "integer",
+ "with_skill": {
+ "pass_rate": {
+ "mean": "float",
+ "stddev": "float",
+ "min": "float",
+ "max": "float"
+ },
+ "tokens": {
+ "mean": "float",
+ "stddev": "float"
+ },
+ "time_seconds": {
+ "mean": "float",
+ "stddev": "float"
+ }
+ },
+ "without_skill": {
+ "pass_rate": { "mean": "float", "stddev": "float", "min": "float", "max": "float" },
+ "tokens": { "mean": "float", "stddev": "float" },
+ "time_seconds": { "mean": "float", "stddev": "float" }
+ },
+ "delta": {
+ "pass_rate": "float or null — with_skill minus without_skill",
+ "description": "string — human-readable interpretation"
+ },
+ "eval_results": [
+ {
+ "eval_id": "string",
+ "configuration": "string",
+ "pass_rate": "float",
+ "pass_count": "integer",
+ "fail_count": "integer",
+ "without_skill_pass_rate": "float or null",
+ "with_skill_tokens": "integer",
+ "with_skill_duration": "float",
+ "without_skill_tokens": "integer",
+ "without_skill_duration": "float"
+ }
+ ]
+}
+```
+
+**Required fields for analyzer agent**: `with_skill.pass_rate.mean`,
+`without_skill.pass_rate.mean`, `delta.pass_rate`
+
+---
+
+## comparison.json
+
+Location: `skill-workspace/iteration-N/{eval-id}/comparison.json`
+
+```json
+{
+ "eval_id": "string",
+ "timestamp": "string — ISO 8601",
+ "rubric": [
+ {
+ "criterion": "string",
+ "description": "string",
+ "weight": "float — all weights sum to 1.0"
+ }
+ ],
+ "scores": {
+ "A": {
+ "criteria_scores": [
+ {
+ "criterion": "string — must match rubric criterion name",
+ "score": "integer — 1 to 5",
+ "rationale": "string — specific evidence"
+ }
+ ],
+ "total_score": "float — weighted sum normalized to 1-10 scale",
+ "assertion_pass_rate": "float or null"
+ },
+ "B": {
+ "criteria_scores": [],
+ "total_score": "float",
+ "assertion_pass_rate": "float or null"
+ }
+ },
+ "winner": "string — 'A', 'B', or 'tie'",
+ "winner_margin": "float — absolute difference in total_score",
+ "reasoning": "string — 2-4 sentences with specific criterion references",
+ "confidence": "string — 'high', 'medium', or 'low'",
+ "comparator_notes": "string or null"
+}
+```
+
+**Required fields for analyzer agent**: `winner`, `scores.A.total_score`,
+`scores.B.total_score`, `reasoning`
+
+---
+
+## analysis.json
+
+Location: `skill-workspace/iteration-N/analysis.json`
+
+```json
+{
+ "mode": "string — 'comparison' or 'benchmark'",
+ "timestamp": "string — ISO 8601",
+ "skill_won": "boolean",
+ "findings": [
+ {
+ "category": "string — one of: winner_factors, loser_improvements, instruction_analysis, transcript_waste, assertion_quality, metric_outliers, variance",
+ "priority": "string — 'high', 'medium', or 'low'",
+ "finding": "string — specific observation with evidence",
+ "actionable_suggestion": "string — concrete change"
+ }
+ ],
+ "improvements_for_skill": [
+ {
+ "target": "string — which section/instruction",
+ "current_behavior": "string",
+ "desired_behavior": "string",
+ "rationale": "string",
+ "generalization_risk": "string — 'low', 'medium', or 'high'"
+ }
+ ],
+ "improvements_for_evals": [
+ {
+ "assertion": "string",
+ "problem": "string",
+ "replacement": "string"
+ }
+ ],
+ "benchmark_summary": {
+ "with_skill_pass_rate_mean": "float or null",
+ "without_skill_pass_rate_mean": "float or null",
+ "delta": "float or null",
+ "comparator_win_rate": "float or null",
+ "top_failure_categories": ["array of strings"]
+ },
+ "analyzer_notes": "string or null"
+}
+```
+
+**Required fields for `package_results.py`**: `findings`, `improvements_for_skill`,
+`benchmark_summary.delta`
+
+---
+
+## timing.json
+
+Location: `skill-workspace/iteration-N/{eval-id}/{configuration}/timing.json`
+
+```json
+{
+ "duration_seconds": "float — wall-clock seconds for the claude -p run",
+ "tokens_total": "integer — sum of input_tokens and output_tokens",
+ "timed_out": "boolean — true if the run hit the timeout limit"
+}
+```
+
+Produced by: `run_eval.py`
+Consumed by: `aggregate_benchmark.py`
+
+---
+
+## metrics.json
+
+Location: `skill-workspace/iteration-N/{eval-id}/{configuration}/metrics.json`
+
+```json
+{
+ "tool_usage": {
+ "Read": "integer — number of Read tool calls",
+ "Write": "integer",
+ "Edit": "integer",
+ "Bash": "integer",
+ "Grep": "integer",
+ "Glob": "integer",
+ "Agent": "integer"
+ },
+ "total_tool_calls": "integer — sum of all tool_usage values"
+}
+```
+
+Produced by: `run_eval.py`
+Consumed by: grader agent (for context about execution behavior)
+
+---
+
+## trigger-eval.json
+
+Location: `skill-workspace/evals/trigger-eval.json`
+
+```json
+[
+ {
+ "query": "string — user prompt to test triggering",
+ "should_trigger": "boolean — true if the skill should activate for this query"
+ }
+]
+```
+
+**Conventions**:
+- Include 10 should_trigger: true entries (vary directness and phrasing)
+- Include 10 should_trigger: false entries (near-miss adjacent domains)
+- Use realistic prompts with context, not abstract one-liners
+- Test edge cases where the skill competes with adjacent skills
+
+Produced by: Skill creator (human)
+Consumed by: `optimize_description.py`
diff --git a/agents/skill-creator-engineer/references/complexity-examples.md b/skills/skill-creator/references/complexity-tiers.md
similarity index 100%
rename from agents/skill-creator-engineer/references/complexity-examples.md
rename to skills/skill-creator/references/complexity-tiers.md
diff --git a/agents/skill-creator-engineer/references/error-catalog.md b/skills/skill-creator/references/error-catalog.md
similarity index 100%
rename from agents/skill-creator-engineer/references/error-catalog.md
rename to skills/skill-creator/references/error-catalog.md
diff --git a/agents/skill-creator-engineer/references/skill-template.md b/skills/skill-creator/references/skill-template.md
similarity index 100%
rename from agents/skill-creator-engineer/references/skill-template.md
rename to skills/skill-creator/references/skill-template.md
diff --git a/agents/skill-creator-engineer/references/workflow-patterns.md b/skills/skill-creator/references/workflow-patterns.md
similarity index 100%
rename from agents/skill-creator-engineer/references/workflow-patterns.md
rename to skills/skill-creator/references/workflow-patterns.md
diff --git a/skills/skill-creator/scripts/aggregate_benchmark.py b/skills/skill-creator/scripts/aggregate_benchmark.py
new file mode 100644
index 0000000..e4795e9
--- /dev/null
+++ b/skills/skill-creator/scripts/aggregate_benchmark.py
@@ -0,0 +1,259 @@
+#!/usr/bin/env python3
+"""
+aggregate_benchmark.py — Compute statistics across eval runs in an iteration workspace.
+
+Reads grading.json from each eval directory. Computes mean, standard deviation, and
+delta (with_skill minus without_skill) for pass_rate, time_seconds, and tokens.
+
+Produces:
+ {workspace}/benchmark.json Machine-readable statistics
+ {workspace}/benchmark.md Human-readable summary
+"""
+
+import argparse
+import json
+import math
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+
+
+def build_parser() -> argparse.ArgumentParser:
+ p = argparse.ArgumentParser(
+ description="Aggregate benchmark statistics from eval grading results",
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ epilog=__doc__,
+ )
+ p.add_argument("workspace", help="Path to iteration workspace directory (e.g. skill-workspace/iteration-1)")
+ p.add_argument("--skill-name", required=True, help="Name of the skill being benchmarked")
+ return p
+
+
+def find_eval_dirs(workspace: Path) -> list[Path]:
+ """Find all eval directories that contain grading.json."""
+ eval_dirs = []
+ for child in sorted(workspace.iterdir()):
+ if child.is_dir() and (child / "grading.json").exists():
+ eval_dirs.append(child)
+ return eval_dirs
+
+
+def load_grading(eval_dir: Path) -> dict | None:
+ """Load grading.json from an eval directory."""
+ grading_path = eval_dir / "grading.json"
+ try:
+ return json.loads(grading_path.read_text())
+ except (json.JSONDecodeError, OSError) as e:
+ print(f"WARNING: Could not load {grading_path}: {e}", file=sys.stderr)
+ return None
+
+
+def load_timing(eval_dir: Path, configuration: str) -> dict:
+ """Load timing.json for a given configuration (with_skill or without_skill)."""
+ timing_path = eval_dir / configuration / "timing.json"
+ try:
+ return json.loads(timing_path.read_text())
+ except (json.JSONDecodeError, OSError):
+ return {"duration_seconds": 0.0, "tokens_total": 0}
+
+
+def mean(values: list[float]) -> float:
+ if not values:
+ return 0.0
+ return sum(values) / len(values)
+
+
+def stddev(values: list[float]) -> float:
+ if len(values) < 2:
+ return 0.0
+ m = mean(values)
+ variance = sum((v - m) ** 2 for v in values) / (len(values) - 1)
+ return math.sqrt(variance)
+
+
+def aggregate(workspace: Path, skill_name: str) -> dict:
+ eval_dirs = find_eval_dirs(workspace)
+ if not eval_dirs:
+ print(f"ERROR: No eval directories with grading.json found in {workspace}", file=sys.stderr)
+ sys.exit(1)
+
+ with_skill_pass_rates = []
+ without_skill_pass_rates = []
+ with_skill_tokens = []
+ without_skill_tokens = []
+ with_skill_durations = []
+ without_skill_durations = []
+
+ eval_results = []
+
+ for eval_dir in eval_dirs:
+ grading = load_grading(eval_dir)
+ if grading is None:
+ continue
+
+ config = grading.get("configuration")
+ if config not in ("with_skill", "without_skill"):
+            print(f"WARNING: {eval_dir.name}/grading.json has missing or invalid 'configuration' field, skipping", file=sys.stderr)
+ continue
+ pass_rate = float(grading.get("pass_rate", 0.0))
+
+ with_timing = load_timing(eval_dir, "with_skill")
+ without_timing = load_timing(eval_dir, "without_skill")
+
+ if config == "with_skill":
+ with_skill_pass_rates.append(pass_rate)
+ with_skill_tokens.append(float(with_timing.get("tokens_total", 0)))
+ with_skill_durations.append(float(with_timing.get("duration_seconds", 0)))
+ else:
+ without_skill_pass_rates.append(pass_rate)
+ without_skill_tokens.append(float(without_timing.get("tokens_total", 0)))
+ without_skill_durations.append(float(without_timing.get("duration_seconds", 0)))
+
+        # Unconditionally look for paired without_skill grading alongside this eval
+        # (eval dirs may contain only one grading.json; paired data comes from grading_without.json)
+ without_pass_rate = None
+ paired_grading_path = eval_dir / "grading_without.json"
+ if paired_grading_path.exists():
+ try:
+ paired = json.loads(paired_grading_path.read_text())
+ without_pass_rate = float(paired.get("pass_rate", 0.0))
+ except (json.JSONDecodeError, OSError):
+ pass
+
+ eval_results.append(
+ {
+ "eval_id": eval_dir.name,
+ "configuration": config,
+ "pass_rate": pass_rate,
+ "pass_count": grading.get("pass_count", 0),
+ "fail_count": grading.get("fail_count", 0),
+ "without_skill_pass_rate": without_pass_rate,
+ "with_skill_tokens": with_timing.get("tokens_total", 0),
+ "with_skill_duration": with_timing.get("duration_seconds", 0),
+ "without_skill_tokens": without_timing.get("tokens_total", 0),
+ "without_skill_duration": without_timing.get("duration_seconds", 0),
+ }
+ )
+
+ # Compute aggregates
+ ws_mean = mean(with_skill_pass_rates)
+ wos_mean = mean(without_skill_pass_rates)
+ delta = ws_mean - wos_mean if with_skill_pass_rates and without_skill_pass_rates else None
+
+ benchmark = {
+ "skill_name": skill_name,
+ "workspace": str(workspace),
+ "timestamp": datetime.now(timezone.utc).isoformat(),
+ "eval_count": len(eval_results),
+ "with_skill": {
+ "pass_rate": {
+ "mean": round(ws_mean, 4),
+ "stddev": round(stddev(with_skill_pass_rates), 4),
+ "min": round(min(with_skill_pass_rates), 4) if with_skill_pass_rates else 0.0,
+ "max": round(max(with_skill_pass_rates), 4) if with_skill_pass_rates else 0.0,
+ },
+ "tokens": {
+ "mean": round(mean(with_skill_tokens), 1),
+ "stddev": round(stddev(with_skill_tokens), 1),
+ },
+ "time_seconds": {
+ "mean": round(mean(with_skill_durations), 2),
+ "stddev": round(stddev(with_skill_durations), 2),
+ },
+ },
+ "without_skill": {
+ "pass_rate": {
+ "mean": round(wos_mean, 4),
+ "stddev": round(stddev(without_skill_pass_rates), 4),
+ "min": round(min(without_skill_pass_rates), 4) if without_skill_pass_rates else 0.0,
+ "max": round(max(without_skill_pass_rates), 4) if without_skill_pass_rates else 0.0,
+ },
+ "tokens": {
+ "mean": round(mean(without_skill_tokens), 1),
+ "stddev": round(stddev(without_skill_tokens), 1),
+ },
+ "time_seconds": {
+ "mean": round(mean(without_skill_durations), 2),
+ "stddev": round(stddev(without_skill_durations), 2),
+ },
+ },
+ "delta": {
+ "pass_rate": round(delta, 4) if delta is not None else None,
+ "description": "with_skill minus without_skill; positive means skill helps",
+ },
+ "eval_results": eval_results,
+ }
+
+ return benchmark
+
+
+def render_markdown(benchmark: dict) -> str:
+ ws = benchmark["with_skill"]
+ wos = benchmark["without_skill"]
+ delta = benchmark["delta"]["pass_rate"]
+ delta_str = f"+{delta:.1%}" if delta is not None and delta > 0 else (f"{delta:.1%}" if delta is not None else "N/A")
+
+ lines = [
+ f"# Benchmark: {benchmark['skill_name']}\n",
+ f"**Generated**: {benchmark['timestamp']} \n",
+ f"**Evals**: {benchmark['eval_count']}\n\n",
+ "## Pass Rate\n\n",
+ "| Configuration | Mean | StdDev | Min | Max |\n",
+ "|--------------|------|--------|-----|-----|\n",
+ f"| with_skill | {ws['pass_rate']['mean']:.1%} | {ws['pass_rate']['stddev']:.1%} | {ws['pass_rate']['min']:.1%} | {ws['pass_rate']['max']:.1%} |\n",
+ f"| without_skill | {wos['pass_rate']['mean']:.1%} | {wos['pass_rate']['stddev']:.1%} | {wos['pass_rate']['min']:.1%} | {wos['pass_rate']['max']:.1%} |\n",
+ f"| **delta** | **{delta_str}** | — | — | — |\n\n",
+ "## Token Usage\n\n",
+ "| Configuration | Mean Tokens | StdDev |\n",
+ "|--------------|-------------|--------|\n",
+ f"| with_skill | {ws['tokens']['mean']:.0f} | {ws['tokens']['stddev']:.0f} |\n",
+ f"| without_skill | {wos['tokens']['mean']:.0f} | {wos['tokens']['stddev']:.0f} |\n\n",
+ "## Duration (seconds)\n\n",
+ "| Configuration | Mean | StdDev |\n",
+ "|--------------|------|--------|\n",
+ f"| with_skill | {ws['time_seconds']['mean']:.1f}s | {ws['time_seconds']['stddev']:.1f}s |\n",
+ f"| without_skill | {wos['time_seconds']['mean']:.1f}s | {wos['time_seconds']['stddev']:.1f}s |\n\n",
+ "## Per-Eval Results\n\n",
+ "| Eval | Config | Pass Rate | Pass | Fail |\n",
+ "|------|--------|-----------|------|------|\n",
+ ]
+
+ for er in benchmark["eval_results"]:
+ lines.append(
+ f"| {er['eval_id']} | {er['configuration']} | {er['pass_rate']:.1%} | {er['pass_count']} | {er['fail_count']} |\n"
+ )
+
+ return "".join(lines)
+
+
+def main() -> int:
+ parser = build_parser()
+ args = parser.parse_args()
+ workspace = Path(args.workspace).resolve()
+
+ if not workspace.exists():
+ print(f"ERROR: Workspace directory does not exist: {workspace}", file=sys.stderr)
+ return 1
+
+ benchmark = aggregate(workspace, args.skill_name)
+
+ benchmark_json = workspace / "benchmark.json"
+ benchmark_json.write_text(json.dumps(benchmark, indent=2))
+ print(f"Written: {benchmark_json}", file=sys.stderr)
+
+ benchmark_md = workspace / "benchmark.md"
+ benchmark_md.write_text(render_markdown(benchmark))
+ print(f"Written: {benchmark_md}", file=sys.stderr)
+
+ delta = benchmark["delta"]["pass_rate"]
+ if delta is not None:
+ sign = "+" if delta > 0 else ""
+ print(f"Pass rate delta: {sign}{delta:.1%} (with_skill vs without_skill)")
+ else:
+ print("Pass rate delta: N/A (missing one or both configurations)")
+
+ return 0
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/skills/skill-creator/scripts/eval_compare.py b/skills/skill-creator/scripts/eval_compare.py
new file mode 100644
index 0000000..58f1849
--- /dev/null
+++ b/skills/skill-creator/scripts/eval_compare.py
@@ -0,0 +1,274 @@
+#!/usr/bin/env python3
+"""Generate blind A/B comparison HTML from eval workspace data.
+
+Scans workspace, collects output files, runs deterministic checks
+(go build, go vet, go test -race where applicable), loads grading
+and blind comparison data, injects into compare.html template.
+Outputs compare_report.html.
+
+Usage:
+    python3 eval_compare.py <workspace>
+ python3 eval_compare.py --help
+"""
+
+import argparse
+import json
+import os
+import subprocess
+import sys
+from pathlib import Path
+
+
+def build_parser() -> argparse.ArgumentParser:
+ p = argparse.ArgumentParser(
+ description="Generate blind A/B comparison HTML from eval workspace data.",
+ epilog="Workspace must contain compare.html template and iteration-*/ directories.",
+ )
+ p.add_argument("workspace", type=Path, help="Path to the eval workspace directory")
+ p.add_argument(
+        "--output", type=Path, default=None, help="Output HTML path (default: <workspace>/compare_report.html)"
+ )
+ return p
+
+
+def load_json_safe(path: Path) -> dict | None:
+ """Load JSON from a file, returning None on any error."""
+ try:
+ return json.loads(path.read_text(encoding="utf-8"))
+ except (json.JSONDecodeError, OSError, UnicodeDecodeError) as e:
+ print(f"WARNING: Could not load {path}: {e}", file=sys.stderr)
+ return None
+
+
+def read_text_safe(path: Path) -> str:
+ """Read text file with encoding fallback."""
+ try:
+ return path.read_text(encoding="utf-8", errors="replace")
+ except OSError:
+ return ""
+
+
+def find_files(outputs_dir: Path) -> list[str]:
+ """List all files relative to outputs dir."""
+ files = []
+ for root, _, filenames in os.walk(outputs_dir):
+ for f in filenames:
+ rel = os.path.relpath(Path(root, f), outputs_dir)
+ files.append(rel)
+ return sorted(files)
+
+
+def count_go_lines(outputs_dir: Path) -> int:
+ """Count total lines across all .go files."""
+ total = 0
+ for root, _, filenames in os.walk(outputs_dir):
+ for f in filenames:
+ if f.endswith(".go"):
+ content = read_text_safe(Path(root, f))
+ total += len(content.splitlines())
+ return total
+
+
+def get_code_preview(outputs_dir: Path, max_lines: int = 60) -> str:
+ """Get preview of main .go file content."""
+ for root, _, filenames in os.walk(outputs_dir):
+ for f in sorted(filenames):
+ if f.endswith(".go") and not f.endswith("_test.go"):
+ content = read_text_safe(Path(root, f))
+ lines = content.splitlines()
+ if len(lines) > max_lines:
+ return "\n".join(lines[:max_lines]) + f"\n... ({len(lines) - max_lines} more lines)"
+ return content
+ return ""
+
+
+def run_go_check(outputs_dir: Path, cmd: list[str], timeout: int = 30) -> str:
+ """Run a go command in the outputs directory, return 'yes'/'no'/'clean'/'issues'."""
+ # Find the go module root (prefer directory with go.mod)
+ mod_root = None
+ go_dirs = []
+ for root, _, files in os.walk(outputs_dir):
+ if "go.mod" in files:
+ mod_root = root
+ break
+ if any(f.endswith(".go") for f in files):
+ go_dirs.append(root)
+
+ target = mod_root or (go_dirs[0] if go_dirs else None)
+ if target is None:
+ return "no_go_files"
+
+ try:
+ result = subprocess.run(cmd, cwd=target, capture_output=True, text=True, timeout=timeout)
+ if result.returncode == 0:
+ return "yes" if "build" in cmd or "test" in cmd else "clean"
+ return "no" if "build" in cmd or "test" in cmd else "issues"
+ except (subprocess.TimeoutExpired, FileNotFoundError):
+ return "skip"
+
+
+def load_grading(variant_dir: Path) -> dict | None:
+ """Load and normalize grading.json."""
+ path = variant_dir / "grading.json"
+ if not path.exists():
+ return None
+ raw = load_json_safe(path)
+ if raw is None:
+ return None
+ exps = raw.get("expectations", raw.get("assertions", []))
+ normalized = []
+ for e in exps:
+ text = e.get("text", e.get("assertion", "?"))
+ is_pass = e.get("passed") is True or e.get("verdict", "") == "PASS"
+ evidence = e.get("evidence", "")
+ normalized.append({"text": text, "passed": is_pass, "evidence": evidence})
+ passed = sum(1 for n in normalized if n["passed"])
+ tl = raw.get("pass_count")
+ if tl is not None:
+ passed = tl
+ total = len(normalized)
+ return {
+ "expectations": normalized,
+ "summary": {
+ "passed": passed,
+ "failed": total - passed,
+ "total": total,
+            "pass_rate": round(passed / total, 3) if total > 0 else 0.0,
+ },
+ }
+
+
+def build_variant_data(variant_dir: Path) -> dict:
+ """Build data dict for one variant."""
+ outputs = variant_dir / "outputs"
+ if not outputs.exists():
+ return {}
+ files = find_files(outputs)
+ return {
+ "lines": count_go_lines(outputs),
+ "files": files,
+ "fileCount": len(files),
+ "code_preview": get_code_preview(outputs),
+ "compiles": run_go_check(outputs, ["go", "build", "./..."]),
+ "tests_pass": run_go_check(outputs, ["go", "test", "-race", "-count=1", "./..."]),
+ "govet": run_go_check(outputs, ["go", "vet", "./..."]),
+ "grading": load_grading(variant_dir),
+ }
+
+
+def find_iteration_dirs(workspace: Path) -> list[Path]:
+ """Find all iteration-N directories, sorted by number."""
+ dirs = sorted(workspace.glob("iteration-*"))
+ return [d for d in dirs if d.is_dir()]
+
+
+def build_data(workspace: Path) -> dict:
+ """Build full comparison data."""
+ evals_path = workspace / "evals" / "evals.json"
+ evals_meta = {}
+ evals_raw = None
+ if evals_path.exists():
+ evals_raw = load_json_safe(evals_path)
+ if evals_raw:
+ for ev in evals_raw.get("evals", []):
+ evals_meta[ev.get("name", ev.get("id", ""))] = ev
+
+ evals_data = []
+ benchmark = []
+
+ # Use the latest iteration directory (or iteration-1 as fallback)
+ iterations = find_iteration_dirs(workspace)
+ if not iterations:
+ return {
+ "evals": [],
+ "benchmark": [],
+ "variantAName": "Variant A",
+ "variantBName": "Variant B",
+ "variantCName": "Variant C",
+ }
+
+ iteration = iterations[-1] # Latest iteration
+
+ for eval_dir in sorted(iteration.iterdir()):
+ if not eval_dir.is_dir():
+ continue
+ name = eval_dir.name
+ a_data = build_variant_data(eval_dir / "variant-A")
+ b_data = build_variant_data(eval_dir / "variant-B")
+ c_data = build_variant_data(eval_dir / "variant-C")
+
+ prompt = evals_meta.get(name, {}).get("prompt", "")
+
+ # Load blind comparisons if available
+ blind = (
+ load_json_safe(eval_dir / "blind_comparison.json")
+ if (eval_dir / "blind_comparison.json").exists()
+ else None
+ )
+ blind_bc = (
+ load_json_safe(eval_dir / "blind_comparison_bc.json")
+ if (eval_dir / "blind_comparison_bc.json").exists()
+ else None
+ )
+
+ eval_entry = {
+ "name": name,
+ "prompt": prompt,
+ "variantA": a_data,
+ "variantB": b_data,
+ "blind_comparison": blind,
+ "blind_comparison_bc": blind_bc,
+ }
+ if c_data:
+ eval_entry["variantC"] = c_data
+ evals_data.append(eval_entry)
+
+ a_rate = a_data.get("grading", {}).get("summary", {}).get("pass_rate", 0) if a_data.get("grading") else 0
+ b_rate = b_data.get("grading", {}).get("summary", {}).get("pass_rate", 0) if b_data.get("grading") else 0
+ c_rate = c_data.get("grading", {}).get("summary", {}).get("pass_rate", 0) if c_data.get("grading") else 0
+ bm = {"name": name, "aRate": a_rate, "bRate": b_rate}
+ if c_data:
+ bm["cRate"] = c_rate
+ benchmark.append(bm)
+
+ variants = evals_raw.get("variants", {}) if evals_raw else {}
+
+ return {
+ "evals": evals_data,
+ "benchmark": benchmark,
+ "variantAName": variants.get("A", {}).get("name", "Variant A"),
+ "variantBName": variants.get("B", {}).get("name", "Variant B"),
+ "variantCName": variants.get("C", {}).get("name", "Variant C"),
+ }
+
+
+def main() -> int:
+ parser = build_parser()
+ args = parser.parse_args()
+
+ workspace = args.workspace.resolve()
+ template = workspace / "compare.html"
+ output = (args.output or workspace / "compare_report.html").resolve()
+
+ if not template.exists():
+ print(f"Error: {template} not found", file=sys.stderr)
+ return 1
+
+ data = build_data(workspace)
+ html = read_text_safe(template).replace("__DATA_PLACEHOLDER__", json.dumps(data, indent=2))
+ output.write_text(html, encoding="utf-8")
+
+ print(f"Report: {output}")
+ print(f"Evals: {len(data['evals'])}")
+ for ev in data["evals"]:
+ a = ev.get("variantA", {})
+ b = ev.get("variantB", {})
+ print(
+ f" {ev['name']}: A={a.get('lines', 0)}L/{a.get('compiles', '?')} B={b.get('lines', 0)}L/{b.get('compiles', '?')}"
+ )
+
+ return 0
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/skills/skill-creator/scripts/optimize_description.py b/skills/skill-creator/scripts/optimize_description.py
new file mode 100644
index 0000000..ae36723
--- /dev/null
+++ b/skills/skill-creator/scripts/optimize_description.py
@@ -0,0 +1,339 @@
+#!/usr/bin/env python3
+"""
+optimize_description.py — Train/test description optimization for skill triggering accuracy.
+
+Splits eval queries 60/40 train/test. Evaluates the current description (3 runs per query
+for variance reduction). Proposes improvements based on train set failures. Re-evaluates
+on both sets. Selects best description by test score to prevent overfitting.
+
+Eval set format (trigger-eval.json):
+ [
+ {"query": "user prompt text", "should_trigger": true},
+ {"query": "adjacent domain prompt", "should_trigger": false}
+ ]
+"""
+
+import argparse
+import json
+import math
+import random
+import shutil
+import subprocess
+import sys
+import tempfile
+from datetime import datetime, timezone
+from pathlib import Path
+
+RUNS_PER_QUERY = 3 # Runs per query for variance reduction
+
+
+def build_parser() -> argparse.ArgumentParser:
+ p = argparse.ArgumentParser(
+ description="Optimize skill description for triggering accuracy",
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ epilog=__doc__,
+ )
+ p.add_argument("--skill-path", required=True, help="Path to the skill directory (contains SKILL.md)")
+ p.add_argument("--eval-set", required=True, help="Path to trigger-eval.json")
+ p.add_argument("--model", default="claude-sonnet-4-6", help="Claude model to use (default: claude-sonnet-4-6)")
+ p.add_argument("--max-iterations", type=int, default=5, help="Maximum optimization iterations (default: 5)")
+ p.add_argument("--seed", type=int, default=42, help="Random seed for train/test split (default: 42)")
+ p.add_argument("--dry-run", action="store_true", help="Show split and current accuracy without optimizing")
+ return p
+
+
+def check_claude_available() -> None:
+ if shutil.which("claude") is None:
+ print(
+ "ERROR: 'claude' CLI not found in PATH.\nInstall with: npm install -g @anthropic-ai/claude-code",
+ file=sys.stderr,
+ )
+ sys.exit(1)
+
+
+def load_eval_set(eval_path: Path) -> list[dict]:
+ try:
+ data = json.loads(eval_path.read_text())
+ except (json.JSONDecodeError, OSError) as e:
+ print(f"ERROR: Could not load eval set {eval_path}: {e}", file=sys.stderr)
+ sys.exit(1)
+
+ if not isinstance(data, list) or not data:
+ print("ERROR: eval set must be a non-empty JSON array", file=sys.stderr)
+ sys.exit(1)
+
+ for entry in data:
+ if "query" not in entry or "should_trigger" not in entry:
+ print(
+ f"ERROR: each eval entry must have 'query' and 'should_trigger' fields. Got: {entry}",
+ file=sys.stderr,
+ )
+ sys.exit(1)
+
+ return data
+
+
+def split_eval_set(eval_set: list[dict], seed: int) -> tuple[list[dict], list[dict]]:
+ """60/40 train/test split, stratified by should_trigger."""
+ rng = random.Random(seed)
+ should_trigger = [e for e in eval_set if e["should_trigger"]]
+ should_not = [e for e in eval_set if not e["should_trigger"]]
+
+ def split(items: list) -> tuple[list, list]:
+ shuffled = items[:]
+ rng.shuffle(shuffled)
+ split_point = math.ceil(len(shuffled) * 0.6)
+ return shuffled[:split_point], shuffled[split_point:]
+
+ train_trigger, test_trigger = split(should_trigger)
+ train_no, test_no = split(should_not)
+ return train_trigger + train_no, test_trigger + test_no
+
+
+def test_trigger(query: str, description: str, model: str) -> bool:
+ """
+ Ask claude whether it would use the skill given this description and query.
+ Returns True if the skill should trigger, False otherwise.
+ """
+ prompt = (
+ f"You are a routing system. A skill has this description:\n\n"
+ f"---\n{description}\n---\n\n"
+ f'A user says: "{query}"\n\n'
+ f"Answer with exactly one word: YES if you would use this skill for this request, "
+ f"NO if you would not. Do not explain."
+ )
+
+ try:
+ result = subprocess.run(
+ ["claude", "-p", prompt, "--model", model],
+ capture_output=True,
+ text=True,
+ timeout=30,
+ )
+ if result.returncode != 0:
+ print(
+ f"WARNING: claude exited {result.returncode}: {result.stderr[:200]}",
+ file=sys.stderr,
+ )
+ return False
+ answer = result.stdout.strip().upper()
+ return answer.startswith("YES")
+ except subprocess.TimeoutExpired:
+ return False
+
+
+def evaluate_description(description: str, eval_queries: list[dict], model: str, runs: int = RUNS_PER_QUERY) -> float:
+ """Evaluate a description against a set of queries. Returns accuracy (0.0-1.0)."""
+ if not eval_queries:
+ return 0.0
+
+ correct = 0
+ total = 0
+
+ for entry in eval_queries:
+ query = entry["query"]
+ should_trigger = entry["should_trigger"]
+
+ # Run multiple times for variance reduction; take majority vote
+ votes = [test_trigger(query, description, model) for _ in range(runs)]
+ majority_triggered = votes.count(True) > runs / 2
+
+ if majority_triggered == should_trigger:
+ correct += 1
+ total += 1
+
+ return correct / total if total > 0 else 0.0
+
+
+def propose_improvement(
+ description: str,
+ train_queries: list[dict],
+ failures: list[dict],
+ model: str,
+) -> str:
+ """
+ Ask claude to propose a better description based on train set failures.
+ Returns the proposed description text.
+ """
+ failure_examples = "\n".join(
+ f'- Query: "{f["query"]}" | Expected: {"TRIGGER" if f["should_trigger"] else "NO TRIGGER"} | Got: {"TRIGGER" if f["triggered"] else "NO TRIGGER"}'
+ for f in failures[:10] # Cap at 10 examples to avoid prompt bloat
+ )
+
+ prompt = (
+ f"You are improving a Claude skill's description to optimize triggering accuracy.\n\n"
+ f"Current description:\n---\n{description}\n---\n\n"
+ f"Failures on training set:\n{failure_examples}\n\n"
+ f"Requirements:\n"
+ f"1. Keep the description under 1024 characters\n"
+ f"2. No XML angle brackets (< or >)\n"
+ f"3. Maintain the What+When formula: 'Do X when Y. Use for [triggers]. Do NOT use for [anti-triggers].'\n"
+ f"4. Do not overfit to the failure examples — improve the description generally\n"
+ f"5. Return ONLY the new description text, no explanation\n\n"
+ f"New description:"
+ )
+
+ try:
+ result = subprocess.run(
+ ["claude", "-p", prompt, "--model", model],
+ capture_output=True,
+ text=True,
+ timeout=60,
+ )
+ if result.returncode != 0:
+ print(
+ f"WARNING: claude exited {result.returncode} proposing improvement: {result.stderr[:200]}",
+ file=sys.stderr,
+ )
+ return description
+ proposed = result.stdout.strip()
+ if not proposed:
+ print("WARNING: claude returned empty improvement. Keeping current.", file=sys.stderr)
+ return description
+ if len(proposed) > 1024:
+ print(f"WARNING: Proposed description exceeds 1024 chars ({len(proposed)}). Truncating.", file=sys.stderr)
+ proposed = proposed[:1020] + "..."
+ return proposed
+ except subprocess.TimeoutExpired:
+ print("WARNING: Timeout proposing description improvement. Keeping current.", file=sys.stderr)
+ return description
+
+
+def identify_failures(description: str, queries: list[dict], model: str) -> list[dict]:
+ """Return list of queries where the description produced incorrect routing."""
+ failures = []
+ for entry in queries:
+ query = entry["query"]
+ should_trigger = entry["should_trigger"]
+ votes = [test_trigger(query, description, model) for _ in range(RUNS_PER_QUERY)]
+ triggered = votes.count(True) > RUNS_PER_QUERY / 2
+ if triggered != should_trigger:
+ failures.append({**entry, "triggered": triggered})
+ return failures
+
+
+def optimize(args: argparse.Namespace) -> int:
+ check_claude_available()
+
+ skill_path = Path(args.skill_path).resolve()
+ skill_md = skill_path / "SKILL.md"
+ eval_path = Path(args.eval_set).resolve()
+
+ if not skill_md.exists():
+ print(f"ERROR: SKILL.md not found at {skill_md}", file=sys.stderr)
+ return 1
+
+ eval_set = load_eval_set(eval_path)
+ train_set, test_set = split_eval_set(eval_set, seed=args.seed)
+
+ print(f"Eval set: {len(eval_set)} queries ({len(train_set)} train, {len(test_set)} test)", file=sys.stderr)
+
+ # Extract current description from SKILL.md frontmatter
+ skill_text = skill_md.read_text()
+ description_start = skill_text.find("description: |")
+ if description_start == -1:
+ print("ERROR: Could not find 'description: |' in SKILL.md frontmatter", file=sys.stderr)
+ return 1
+
+ # Extract description block (lines until next YAML key)
+ lines = skill_text.split("\n")
+ desc_lines = []
+ in_desc = False
+ for line in lines:
+ if line.strip().startswith("description: |"):
+ in_desc = True
+ continue
+ if in_desc:
+ if line and not line[0].isspace() and ":" in line:
+ break
+ desc_lines.append(line.lstrip())
+
+ current_description = "\n".join(desc_lines).strip()
+ print(f"Current description ({len(current_description)} chars)", file=sys.stderr)
+
+ if args.dry_run:
+ train_acc = evaluate_description(current_description, train_set, args.model)
+ test_acc = evaluate_description(current_description, test_set, args.model)
+ print(f"Train accuracy: {train_acc:.1%}")
+ print(f"Test accuracy: {test_acc:.1%}")
+ return 0
+
+ # Evaluate initial accuracy
+ print("Evaluating initial description...", file=sys.stderr)
+ best_description = current_description
+ best_test_acc = evaluate_description(current_description, test_set, args.model)
+ print(f"Initial test accuracy: {best_test_acc:.1%}", file=sys.stderr)
+
+ history = [{"iteration": 0, "description": current_description, "test_accuracy": best_test_acc}]
+
+ for iteration in range(1, args.max_iterations + 1):
+ print(f"\nIteration {iteration}/{args.max_iterations}", file=sys.stderr)
+
+ failures = identify_failures(best_description, train_set, args.model)
+ train_acc = 1.0 - (len(failures) / len(train_set)) if train_set else 0.0
+ print(f"Train accuracy: {train_acc:.1%} ({len(failures)} failures)", file=sys.stderr)
+
+ if not failures:
+ print("No failures on train set. Optimization complete.", file=sys.stderr)
+ break
+
+ proposed = propose_improvement(best_description, train_set, failures, args.model)
+ proposed_test_acc = evaluate_description(proposed, test_set, args.model)
+ print(f"Proposed test accuracy: {proposed_test_acc:.1%}", file=sys.stderr)
+
+ history.append(
+ {
+ "iteration": iteration,
+ "description": proposed,
+ "train_accuracy": train_acc,
+ "test_accuracy": proposed_test_acc,
+ }
+ )
+
+ if proposed_test_acc >= best_test_acc:
+ best_description = proposed
+ best_test_acc = proposed_test_acc
+ print(f"Accepted (test accuracy improved or held: {best_test_acc:.1%})", file=sys.stderr)
+ else:
+ print(f"Rejected (test accuracy decreased: {proposed_test_acc:.1%} < {best_test_acc:.1%})", file=sys.stderr)
+
+ # Report results
+    print("\n=== Optimization Complete ===")
+ print(f"Best test accuracy: {best_test_acc:.1%}")
+ print(f"Iterations run: {len(history) - 1}")
+
+ if best_description != current_description:
+ print(f"\nBest description ({len(best_description)} chars):\n")
+ print(best_description)
+ else:
+ print("\nNo improvement found. Current description is already optimal.")
+
+ # Write history to optimization_history.json alongside the eval set
+ history_path = eval_path.parent / "optimization_history.json"
+ history_path.write_text(
+ json.dumps(
+ {
+ "skill_path": str(skill_path),
+ "eval_set": str(eval_path),
+ "model": args.model,
+ "timestamp": datetime.now(timezone.utc).isoformat(),
+ "best_test_accuracy": best_test_acc,
+ "best_description": best_description,
+ "history": history,
+ },
+ indent=2,
+ )
+ )
+ print(f"\nHistory written: {history_path}", file=sys.stderr)
+
+ return 0
+
+
+def main() -> int:
+ parser = build_parser()
+ args = parser.parse_args()
+ return optimize(args)
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/skills/skill-creator/scripts/package_results.py b/skills/skill-creator/scripts/package_results.py
new file mode 100644
index 0000000..07ce725
--- /dev/null
+++ b/skills/skill-creator/scripts/package_results.py
@@ -0,0 +1,218 @@
+#!/usr/bin/env python3
+"""
+package_results.py — Consolidate all iteration artifacts into a summary report.
+
+Reads grading.json, benchmark.json, analysis.json, and changes.md from each iteration
+directory in the workspace. Produces a single summary report.
+
+Usage:
+ python3 package_results.py workspace/ --format markdown
+ python3 package_results.py workspace/ --format json
+"""
+
+import argparse
+import json
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+
+
def build_parser() -> argparse.ArgumentParser:
    """Build the CLI parser for the summary consolidator."""
    parser = argparse.ArgumentParser(
        description="Consolidate eval iteration artifacts into a summary report",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument("workspace", help="Path to skill-workspace/ root directory")
    parser.add_argument(
        "--format",
        choices=["markdown", "json"],
        default="markdown",
        help="Output format (default: markdown)",
    )
    parser.add_argument("--output", help="Output file path (default: workspace/summary.md or summary.json)")
    return parser
+
+
def find_iteration_dirs(workspace: Path) -> list[Path]:
    """Find all iteration-N directories in the workspace, ordered by N.

    Directories whose suffix after "iteration-" is not parseable as an
    integer (e.g. "iteration-old") are ignored. Returns an empty list
    when nothing matches.

    Fix vs. original: the original sorted twice — a lexicographic
    sorted(workspace.iterdir()) whose order was then discarded by a
    second numeric sort. One numeric sort produces the same result.
    """
    numbered: list[tuple[int, Path]] = []
    for child in workspace.iterdir():
        if child.is_dir() and child.name.startswith("iteration-"):
            try:
                # Parse N once; numeric key so iteration-10 follows iteration-9.
                numbered.append((int(child.name.split("-")[1]), child))
            except (IndexError, ValueError):
                pass
    return [path for _, path in sorted(numbered)]
+
+
+def load_json_safe(path: Path) -> dict | list | None:
+ if not path.exists():
+ return None
+ try:
+ return json.loads(path.read_text())
+ except (json.JSONDecodeError, OSError):
+ return None
+
+
+def load_text_safe(path: Path) -> str | None:
+ if not path.exists():
+ return None
+ try:
+ return path.read_text()
+ except OSError:
+ return None
+
+
def collect_iteration_data(iteration_dir: Path) -> dict:
    """Collect all artifacts from a single iteration directory.

    Gathers benchmark.json, analysis.json, changes.md, and one grading.json
    per eval sub-directory (absent/unparseable artifacts become None/omitted).
    """
    evals = []
    for entry in sorted(iteration_dir.iterdir()):
        if not entry.is_dir():
            continue
        grading = load_json_safe(entry / "grading.json")
        if grading:
            # The sub-directory name doubles as the eval identifier.
            evals.append({"eval_id": entry.name, "grading": grading})

    return {
        "iteration": iteration_dir.name,
        "benchmark": load_json_safe(iteration_dir / "benchmark.json"),
        "analysis": load_json_safe(iteration_dir / "analysis.json"),
        "changes": load_text_safe(iteration_dir / "changes.md"),
        "evals": evals,
    }
+
+
def render_markdown(workspace: Path, iterations: list[dict]) -> str:
    """Render collected iteration data as one markdown report.

    Sections: header, a pass-rate progression table across iterations,
    per-iteration details (changes excerpt, eval-results table,
    high-priority findings), and a final assessment derived from the
    last iteration's benchmark delta.
    """
    lines = [
        "# Skill Eval Summary\n",
        f"**Workspace**: `{workspace}` \n",
        f"**Generated**: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')} \n",
        f"**Iterations**: {len(iterations)}\n\n",
    ]

    # Progress table across iterations
    # Only emitted if at least one iteration has benchmark data.
    if any(it["benchmark"] for it in iterations):
        lines.append("## Pass Rate Progression\n\n")
        lines.append("| Iteration | With Skill | Without Skill | Delta |\n")
        lines.append("|-----------|-----------|---------------|-------|\n")

        for it in iterations:
            b = it["benchmark"]
            if b:
                # Nested .get chains tolerate partially-populated benchmark.json.
                ws = b.get("with_skill", {}).get("pass_rate", {}).get("mean", 0)
                wos = b.get("without_skill", {}).get("pass_rate", {}).get("mean", 0)
                delta = b.get("delta", {}).get("pass_rate")
                # Positive deltas get an explicit "+"; missing delta renders "N/A".
                delta_str = (
                    f"+{delta:.1%}"
                    if delta is not None and delta > 0
                    else (f"{delta:.1%}" if delta is not None else "N/A")
                )
                lines.append(f"| {it['iteration']} | {ws:.1%} | {wos:.1%} | {delta_str} |\n")
            else:
                lines.append(f"| {it['iteration']} | — | — | — |\n")

        lines.append("\n")

    # Per-iteration sections
    for it in iterations:
        lines.append(f"## {it['iteration'].replace('-', ' ').title()}\n\n")

        # Changes summary
        if it["changes"]:
            lines.append("### Changes Made\n\n")
            # Include first 50 lines of changes.md
            change_lines = it["changes"].split("\n")[:50]
            lines.append("\n".join(change_lines))
            if len(it["changes"].split("\n")) > 50:
                lines.append("\n_(truncated — see changes.md for full content)_")
            lines.append("\n\n")

        # Eval results
        if it["evals"]:
            lines.append("### Eval Results\n\n")
            lines.append("| Eval | Pass Rate | Pass | Fail |\n")
            lines.append("|------|-----------|------|------|\n")
            for ev in it["evals"]:
                g = ev["grading"]
                lines.append(
                    f"| {ev['eval_id']} | {g.get('pass_rate', 0):.1%} | {g.get('pass_count', 0)} | {g.get('fail_count', 0)} |\n"
                )
            lines.append("\n")

        # Top findings from analysis
        if it["analysis"]:
            findings = it["analysis"].get("findings", [])
            high_priority = [f for f in findings if f.get("priority") == "high"]
            if high_priority:
                lines.append("### High-Priority Findings\n\n")
                # Cap at five findings to keep the report scannable.
                for f in high_priority[:5]:
                    lines.append(f"- **{f.get('category', 'finding')}**: {f.get('finding', '')}\n")
                    if f.get("actionable_suggestion"):
                        lines.append(f" - Suggestion: {f['actionable_suggestion']}\n")
                lines.append("\n")

    # Final recommendation
    # Based solely on the LAST iteration's benchmark delta; ±5% is the
    # threshold separating "measurable" from "marginal" impact.
    if iterations:
        last = iterations[-1]
        b = last.get("benchmark")
        if b:
            delta = b.get("delta", {}).get("pass_rate")
            if delta is not None:
                lines.append("## Final Assessment\n\n")
                if delta > 0.05:
                    lines.append(f"The skill demonstrates measurable improvement: pass rate delta = +{delta:.1%}\n")
                elif delta < -0.05:
                    lines.append(f"The skill performs below baseline: pass rate delta = {delta:.1%}\n")
                    lines.append(
                        "Consider reviewing skill instructions — they may be adding noise rather than signal.\n"
                    )
                else:
                    lines.append(f"The skill shows marginal impact: pass rate delta = {delta:.1%}\n")
                    lines.append("Check whether eval assertions are discriminating (test skill-specific behavior).\n")

    return "".join(lines)
+
+
def main() -> int:
    """CLI entry point: collect iteration artifacts and write the summary report."""
    args = build_parser().parse_args()
    root = Path(args.workspace).resolve()

    if not root.exists():
        print(f"ERROR: Workspace does not exist: {root}", file=sys.stderr)
        return 1

    dirs = find_iteration_dirs(root)
    if not dirs:
        print(f"WARNING: No iteration directories found in {root}", file=sys.stderr)

    collected = [collect_iteration_data(d) for d in dirs]

    if args.format == "markdown":
        default_name = "summary.md"
        content = render_markdown(root, collected)
    else:
        default_name = "summary.json"
        report = {
            "workspace": str(root),
            "generated": datetime.now(timezone.utc).isoformat(),
            "iterations": collected,
        }
        content = json.dumps(report, indent=2)

    target = Path(args.output).resolve() if args.output else root / default_name
    target.write_text(content)
    print(f"Written: {target}")

    return 0


if __name__ == "__main__":
    sys.exit(main())
diff --git a/skills/skill-creator/scripts/run_eval.py b/skills/skill-creator/scripts/run_eval.py
new file mode 100644
index 0000000..d83ce2a
--- /dev/null
+++ b/skills/skill-creator/scripts/run_eval.py
@@ -0,0 +1,206 @@
+#!/usr/bin/env python3
+"""
+run_eval.py — Execute a skill against a test prompt via claude -p subprocess.
+
+Produces in --output-dir:
+ outputs/ All files written during the run
+ transcript.md Full execution log
+ timing.json Token count and wall-clock duration
+ metrics.json Tool usage counts
+"""
+
+import argparse
+import json
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+import time
+from pathlib import Path
+
+
def build_parser() -> argparse.ArgumentParser:
    """Build the CLI parser for a single skill-eval run."""
    parser = argparse.ArgumentParser(
        description="Execute a skill against a test prompt via claude -p",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument("--skill-path", required=True, help="Path to the skill directory (contains SKILL.md)")
    parser.add_argument("--prompt", required=True, help="Test prompt text to run")
    parser.add_argument("--output-dir", required=True, help="Directory to write outputs, transcript, timing, metrics")
    parser.add_argument("--model", default="claude-sonnet-4-6", help="Claude model to use (default: claude-sonnet-4-6)")
    parser.add_argument("--no-skill", action="store_true", help="Run without loading the skill (baseline run)")
    parser.add_argument("--timeout", type=int, default=300, help="Max seconds to wait for claude -p (default: 300)")
    return parser
+
+
def check_claude_available() -> None:
    """Verify claude CLI is in PATH. Exit 1 with actionable message if not."""
    if shutil.which("claude") is not None:
        return
    message = (
        "ERROR: 'claude' CLI not found in PATH.\n"
        "Install with: npm install -g @anthropic-ai/claude-code\n"
        "Verify with: which claude && claude --version"
    )
    print(message, file=sys.stderr)
    sys.exit(1)
+
+
def prepare_output_dir(output_dir: Path) -> Path:
    """Create output directory structure. Returns outputs/ subdirectory."""
    outputs = output_dir / "outputs"
    output_dir.mkdir(parents=True, exist_ok=True)
    outputs.mkdir(exist_ok=True)
    return outputs
+
+
def build_claude_command(
    skill_path: Path,
    prompt: str,
    outputs_dir: Path,
    model: str,
    no_skill: bool,
) -> list[str]:
    """Construct the claude -p command with appropriate flags.

    NOTE(review): confirm `--system-prompt-file` and `--working-dir` exist in
    the installed claude CLI version — flag names vary between releases and
    cannot be verified from this file alone.
    """
    cmd = ["claude", "-p", prompt, "--model", model, "--output-format", "json"]

    if not no_skill:
        # Loading the skill requires SKILL.md to exist; bail out loudly if not.
        skill_md = skill_path / "SKILL.md"
        if not skill_md.exists():
            print(f"ERROR: SKILL.md not found at {skill_md}", file=sys.stderr)
            sys.exit(1)
        cmd += ["--system-prompt-file", str(skill_md)]

    # Ask claude to write outputs to the outputs directory
    cmd += ["--working-dir", str(outputs_dir)]

    return cmd
+
+
def count_tools(transcript_text: str) -> dict:
    """Count tool invocations by type from transcript text."""
    import re

    counts: dict[str, int] = {}
    # Match `"tool": "<name>"` pairs anywhere in the raw transcript output.
    for name in re.findall(r'"tool":\s*"([^"]+)"', transcript_text):
        counts[name] = counts.get(name, 0) + 1
    return counts
+
+
def run_eval(args: argparse.Namespace) -> int:
    """Execute one `claude -p` run and persist its artifacts.

    Writes transcript.md, timing.json, and metrics.json under args.output_dir
    (plus whatever the run writes into outputs/). Returns the subprocess exit
    code, or 1 on timeout / missing CLI.
    """
    check_claude_available()

    skill_path = Path(args.skill_path).resolve()
    output_dir = Path(args.output_dir).resolve()
    outputs_dir = prepare_output_dir(output_dir)

    cmd = build_claude_command(
        skill_path=skill_path,
        prompt=args.prompt,
        outputs_dir=outputs_dir,
        model=args.model,
        no_skill=args.no_skill,
    )

    # Only echo the command prefix — the prompt argument may be very long.
    print(f"Running: {' '.join(cmd[:4])} ...", file=sys.stderr)
    start_time = time.monotonic()

    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=args.timeout,
            cwd=str(outputs_dir),  # run inside outputs/ so file writes land there
        )
    except subprocess.TimeoutExpired:
        # On timeout, still emit all three artifacts so downstream tooling
        # (grading/packaging) always finds well-formed files.
        print(f"ERROR: claude -p timed out after {args.timeout}s", file=sys.stderr)
        (output_dir / "transcript.md").write_text(
            f"# Execution Timeout\n\nRun timed out after {args.timeout} seconds.\n"
        )
        _write_timing(output_dir, duration=float(args.timeout), tokens=0, timed_out=True)
        _write_metrics(output_dir, tool_counts={})
        return 1

    duration = time.monotonic() - start_time

    # Write transcript
    transcript_lines = [
        "# Execution Transcript\n",
        f"**Model**: {args.model}\n",
        f"**Skill loaded**: {not args.no_skill}\n",
        f"**Duration**: {duration:.2f}s\n",
        f"**Exit code**: {result.returncode}\n\n",
        "## stdout\n\n```\n",
        result.stdout or "(empty)",
        "\n```\n\n## stderr\n\n```\n",
        result.stderr or "(empty)",
        "\n```\n",
    ]
    transcript_text = "".join(transcript_lines)
    (output_dir / "transcript.md").write_text(transcript_text)

    # Parse token counts from JSON output if available
    tokens = 0
    try:
        response = json.loads(result.stdout)
        usage = response.get("usage", {})
        tokens = usage.get("input_tokens", 0) + usage.get("output_tokens", 0)
    except (json.JSONDecodeError, AttributeError):
        # AttributeError covers valid JSON that is not an object (e.g. a list);
        # token count stays 0 on any parse failure.
        pass

    _write_timing(output_dir, duration=duration, tokens=tokens, timed_out=False)
    _write_metrics(output_dir, tool_counts=count_tools(result.stdout + result.stderr))

    if result.returncode != 0:
        # Non-zero exit is propagated, not masked — callers decide how to react.
        print(
            f"WARNING: claude -p exited with code {result.returncode}. Check transcript.md for details.",
            file=sys.stderr,
        )
        return result.returncode

    print(f"Eval complete. Outputs: {output_dir}", file=sys.stderr)
    return 0
+
+
+def _write_timing(output_dir: Path, duration: float, tokens: int, timed_out: bool) -> None:
+ timing = {
+ "duration_seconds": round(duration, 3),
+ "tokens_total": tokens,
+ "timed_out": timed_out,
+ }
+ (output_dir / "timing.json").write_text(json.dumps(timing, indent=2))
+
+
+def _write_metrics(output_dir: Path, tool_counts: dict) -> None:
+ metrics = {
+ "tool_usage": tool_counts,
+ "total_tool_calls": sum(tool_counts.values()),
+ }
+ (output_dir / "metrics.json").write_text(json.dumps(metrics, indent=2))
+
+
def main() -> int:
    """Parse CLI arguments and execute a single eval run."""
    return run_eval(build_parser().parse_args())


if __name__ == "__main__":
    sys.exit(main())
diff --git a/skills/skill-eval/SKILL.md b/skills/skill-eval/SKILL.md
index 8bd7ac4..26cbfcf 100644
--- a/skills/skill-eval/SKILL.md
+++ b/skills/skill-eval/SKILL.md
@@ -7,11 +7,11 @@ description: |
with A/B comparisons, and validate skill structure. Use when user says
"improve skill", "test skill triggers", "optimize description", "benchmark
skill", "eval skill", or "skill quality". Do NOT use for creating new skills
- (use skill-creator-engineer).
+ (use skill-creator).
version: 1.0.0
user-invocable: false
argument-hint: ""
-agent: skill-creator-engineer
+agent: skill-creator
allowed-tools:
- Read
- Write
@@ -66,7 +66,7 @@ This skill operates as the eval-driven improvement pipeline for Claude Code skil
- Generate HTML reports for visual review
## What This Skill CANNOT Do
-- Create new skills from scratch (use skill-creator-engineer)
+- Create new skills from scratch (use skill-creator)
- Modify skill instructions automatically (human reviews changes)
- Test skills that require specific MCP servers or external services
- Run evals without the `claude` CLI available
diff --git a/skills/workflow-help/SKILL.md b/skills/workflow-help/SKILL.md
index 9c6d00a..50729a5 100644
--- a/skills/workflow-help/SKILL.md
+++ b/skills/workflow-help/SKILL.md
@@ -62,7 +62,7 @@ This skill operates as an operator for workflow education and guidance, configur
## What This Skill CANNOT Do
- Execute workflows (use workflow-orchestrator)
- Debug code (use systematic-debugging)
-- Create or modify skills (use skill-creator-engineer)
+- Create or modify skills (use skill-creator)
- Run tests or validate code (use verification-before-completion)
- Make decisions about which approach to take for the user's actual task