diff --git a/.claude/settings.json b/.claude/settings.json index ed78662..d77278a 100644 --- a/.claude/settings.json +++ b/.claude/settings.json @@ -96,41 +96,57 @@ ], "PreToolUse": [ { + "matcher": "Bash|Write|Edit", "hooks": [ { "type": "command", "command": "python3 \"$HOME/.claude/hooks/pretool-unified-gate.py\"", "description": "Unified gate: gitignore-bypass, git-submission, dangerous-command, creation-gate, sensitive-file (ADR-068)", "timeout": 3000 - }, + } + ] + }, + { + "matcher": "Bash", + "hooks": [ { "type": "command", - "command": "python3 \"$HOME/.claude/hooks/pretool-synthesis-gate.py\"", - "description": "Consultation synthesis gate: blocks implementation when ADR consultation is incomplete", + "command": "python3 \"$HOME/.claude/hooks/pretool-branch-safety.py\"", + "description": "Branch safety: blocks git commit on main/master, forces feature branches", "timeout": 3000 }, { "type": "command", - "command": "python3 \"$HOME/.claude/hooks/pretool-branch-safety.py\"", - "description": "Branch safety: blocks git commit on main/master, forces feature branches", + "command": "python3 \"$HOME/.claude/hooks/ci-merge-gate.py\"", + "description": "Gate: block merge to main/master when CI checks are red", "timeout": 3000 - }, + } + ] + }, + { + "matcher": "Bash|Edit", + "hooks": [ { "type": "command", - "command": "python3 \"$HOME/.claude/hooks/pretool-plan-gate.py\"", - "description": "Plan gate: blocks implementation code without task_plan.md", + "command": "python3 \"$HOME/.claude/hooks/pretool-learning-injector.py\"", + "description": "Inject known error patterns before Bash/Edit tools run", "timeout": 3000 - }, + } + ] + }, + { + "matcher": "Write|Edit", + "hooks": [ { "type": "command", - "command": "python3 \"$HOME/.claude/hooks/pretool-adr-creation-gate.py\"", - "description": "ADR creation gate: blocks new components without an ADR in adr/", + "command": "python3 \"$HOME/.claude/hooks/pretool-synthesis-gate.py\"", + "description": "Consultation 
synthesis gate: blocks implementation when ADR consultation is incomplete", "timeout": 3000 }, { "type": "command", - "command": "python3 \"$HOME/.claude/hooks/pretool-learning-injector.py\"", - "description": "Inject known error patterns before Bash/Edit tools run", + "command": "python3 \"$HOME/.claude/hooks/pretool-plan-gate.py\"", + "description": "Plan gate: blocks implementation code without task_plan.md", "timeout": 3000 }, { @@ -138,39 +154,51 @@ "command": "python3 \"$HOME/.claude/hooks/pretool-prompt-injection-scanner.py\"", "description": "Advisory scan for prompt injection patterns in agent context files (ADR-070)", "timeout": 3000 - }, + } + ] + }, + { + "matcher": "Write", + "hooks": [ { "type": "command", - "command": "python3 \"$HOME/.claude/hooks/pretool-subagent-warmstart.py\"", - "description": "Inject parent session context into subagent prompts (ADR-088)", - "timeout": 5000 - }, + "command": "python3 \"$HOME/.claude/hooks/pretool-adr-creation-gate.py\"", + "description": "ADR creation gate: blocks new components without an ADR in adr/", + "timeout": 3000 + } + ] + }, + { + "matcher": "Edit", + "hooks": [ { "type": "command", "command": "python3 \"$HOME/.claude/hooks/pretool-file-backup.py\"", "description": "Backup files before Edit tool modifies them", "timeout": 3000 - }, + } + ] + }, + { + "matcher": "Agent", + "hooks": [ { "type": "command", - "command": "python3 \"$HOME/.claude/hooks/ci-merge-gate.py\"", - "description": "Gate: block merge to main/master when CI checks are red", - "timeout": 3000 + "command": "python3 \"$HOME/.claude/hooks/pretool-subagent-warmstart.py\"", + "description": "Inject parent session context into subagent prompts (ADR-088)", + "timeout": 5000 } ] } ], "PostToolUse": [ { + "matcher": "Write|Edit", "hooks": [ { "type": "command", - "command": "python3 \"$HOME/.claude/hooks/post-tool-lint-hint.py\"" - }, - { - "type": "command", - "command": "python3 \"$HOME/.claude/hooks/error-learner.py\"", - "description": 
"Learn from tool errors and suggest solutions" + "command": "python3 \"$HOME/.claude/hooks/post-tool-lint-hint.py\"", + "description": "Gentle lint reminder after file modifications" }, { "type": "command", @@ -185,48 +213,82 @@ }, { "type": "command", - "command": "python3 \"$HOME/.claude/hooks/routing-gap-recorder.py\"", - "description": "Record /do routing gaps to learning DB for pattern tracking", - "timeout": 2000 - }, + "command": "python3 \"$HOME/.claude/hooks/posttool-security-scan.py\"", + "description": "Advisory scan for credentials and SQL injection in Write/Edit output", + "timeout": 3000 + } + ] + }, + { + "matcher": "Bash", + "hooks": [ { "type": "command", "command": "python3 \"$HOME/.claude/hooks/retro-graduation-gate.py\"", "description": "Warn about ungraduated retro entries when creating PRs in toolkit repo", "timeout": 3000 - }, + } + ] + }, + { + "matcher": "Edit|Write|Bash", + "hooks": [ { "type": "command", "command": "python3 \"$HOME/.claude/hooks/record-activation.py\"", "description": "Record session activation stats for ROI tracking (ADR-032)" - }, - { - "type": "command", - "command": "python3 \"$HOME/.claude/hooks/record-waste.py\"", - "description": "Record wasted tokens from tool failures for ROI tracking (ADR-032)" - }, + } + ] + }, + { + "matcher": "Read", + "hooks": [ { "type": "command", "command": "python3 \"$HOME/.claude/hooks/posttool-session-reads.py\"", "description": "Track files read this session for subagent warmstart (ADR-088)" - }, - { - "type": "command", - "command": "python3 \"$HOME/.claude/hooks/posttool-security-scan.py\"", - "description": "Advisory scan for credentials and SQL injection in Write/Edit output", - "timeout": 3000 - }, + } + ] + }, + { + "matcher": "Skill|Agent", + "hooks": [ { "type": "command", "command": "python3 \"$HOME/.claude/hooks/usage-tracker.py\"", "description": "Record Skill and Agent invocation analytics", "timeout": 3000 - }, + } + ] + }, + { + "matcher": "Agent", + "hooks": [ { "type": 
"command", "command": "python3 \"$HOME/.claude/hooks/review-capture.py\"", "description": "Capture CRITICAL/HIGH review findings to learning DB", "timeout": 3000 + } + ] + }, + { + "hooks": [ + { + "type": "command", + "command": "python3 \"$HOME/.claude/hooks/error-learner.py\"", + "description": "Learn from tool errors and suggest solutions" + }, + { + "type": "command", + "command": "python3 \"$HOME/.claude/hooks/routing-gap-recorder.py\"", + "description": "Record /do routing gaps to learning DB for pattern tracking", + "timeout": 2000 + }, + { + "type": "command", + "command": "python3 \"$HOME/.claude/hooks/record-waste.py\"", + "description": "Record wasted tokens from tool failures for ROI tracking (ADR-032)" }, { "type": "command", diff --git a/.gitignore b/.gitignore index badb744..8cbc4c5 100644 --- a/.gitignore +++ b/.gitignore @@ -94,3 +94,11 @@ draft-*.md # Scratch notes (session working files, not committed) scratch/ + +# Eval workspaces (A/B/C test outputs, generated code, grading artifacts) +# These are ephemeral experiment data — not committed +*-workspace/ +evals/ + +# Feature state (ephemeral, per-session feature lifecycle) +.feature/ diff --git a/agents/INDEX.json b/agents/INDEX.json index b2d730a..7b59053 100644 --- a/agents/INDEX.json +++ b/agents/INDEX.json @@ -4,7 +4,7 @@ "agents": { "agent-creator-engineer": { "file": "agent-creator-engineer.md", - "short_description": "**DEPRECATED**: Use skill-creator-engineer agent instead", + "short_description": "**DEPRECATED**: Use skill-creator skill instead", "triggers": [ "create agent", "new agent", @@ -14,7 +14,7 @@ "legacy agent creation" ], "pairs_with": [ - "skill-creator-engineer", + "skill-creator", "agent-evaluation" ], "complexity": "Simple", @@ -1066,26 +1066,6 @@ "complexity": "Simple", "category": "meta" }, - "skill-creator-engineer": { - "file": "skill-creator-engineer.md", - "short_description": "Use this agent when creating new Claude Code skills, designing workflow automation,\nor 
improving existing skill architecture", - "triggers": [ - "create skill", - "new skill", - "skill template", - "skill design", - "workflow automation", - "skill improvement", - "refactor skill" - ], - "pairs_with": [ - "agent-evaluation", - "verification-before-completion", - "workflow-orchestrator" - ], - "complexity": "Medium-Complex", - "category": "meta" - }, "sqlite-peewee-engineer": { "file": "sqlite-peewee-engineer.md", "short_description": "Use this agent when you need expert assistance with SQLite database development using the Peewee ORM in Python", diff --git a/agents/README.md b/agents/README.md index 2237adb..3cd6dac 100644 --- a/agents/README.md +++ b/agents/README.md @@ -96,12 +96,12 @@ Each agent is defined in `agents/*.md` with YAML frontmatter specifying model, v | Agent | Description | |-------|-------------| -| `skill-creator-engineer` | Create Claude Code skills: progressive disclosure, SKILL.md structure, complexity tier selection | +| `skill-creator` | Create Claude Code skills: progressive disclosure, SKILL.md structure, complexity tier selection | | `hook-development-engineer` | Python hooks: PostToolUse/PreToolUse/SessionStart handlers, sub-50ms performance, learning DB | | `pipeline-orchestrator-engineer` | Build pipelines: multi-component scaffolding, fan-out/fan-in patterns, routing integration | | `system-upgrade-engineer` | Ecosystem upgrades: 6-phase pipeline for adapting to Claude Code releases or goal shifts | | `toolkit-governance-engineer` | Toolkit internal architecture: SKILL.md edits, routing tables, ADR lifecycle, INDEX.json, hook compliance | -| `agent-creator-engineer` | **DEPRECATED** — use `skill-creator-engineer` instead | +| `agent-creator-engineer` | **DEPRECATED** — use `skill-creator` instead | --- diff --git a/agents/README.txt b/agents/README.txt deleted file mode 100644 index 1deba18..0000000 --- a/agents/README.txt +++ /dev/null @@ -1,249 +0,0 @@ -# Agents - -Specialized domain experts that Claude Code can spawn 
for complex tasks requiring deep knowledge. - ---- - -## What are Agents? - -Agents are **domain experts** defined as comprehensive markdown files. Each agent embodies: -- **Deep domain knowledge** - Extensive patterns, anti-patterns, and best practices -- **Real code examples** - Production-ready snippets, not aspirational pseudocode -- **Operator Model configuration** - Hardcoded, default, and optional behaviors - -Agents differ from skills: **agents know things deeply**, **skills know how to do things**. - -``` -Agent: "I understand Go concurrency patterns and can review your code" -Skill: "I know the 4-phase debugging methodology" -``` - ---- - -## Available Agents - -### Language & Framework Experts - -| Agent | Domain | Lines | -|-------|--------|-------| -| [`golang-general-engineer`](golang-general-engineer.md) | Go development, patterns, concurrency | 95K | -| [`golang-general-engineer-compact`](golang-general-engineer-compact.md) | Go (compact variant for faster loading) | ~30K | -| [`python-general-engineer`](python-general-engineer.md) | Python development, best practices | ~40K | -| [`python-openstack-engineer`](python-openstack-engineer.md) | OpenStack Python development | 37K | -| [`typescript-frontend-engineer`](typescript-frontend-engineer.md) | TypeScript, React patterns | 34K | -| [`nodejs-api-engineer`](nodejs-api-engineer.md) | Node.js backend development | 43K | -| [`nextjs-ecommerce-engineer`](nextjs-ecommerce-engineer.md) | Next.js e-commerce | 35K | -| [`react-portfolio-engineer`](react-portfolio-engineer.md) | React portfolio sites | 29K | - -### Code Quality & Review - -| Agent | Domain | Lines | -|-------|--------|-------| -| [`testing-automation-engineer`](testing-automation-engineer.md) | Test strategies, automation | 45K | -| [`technical-documentation-engineer`](technical-documentation-engineer.md) | Technical writing, API docs | 97K | -| [`technical-journalist-writer`](technical-journalist-writer.md) | Technical articles, journalism 
| ~50K | - -### Infrastructure & DevOps - -| Agent | Domain | Lines | -|-------|--------|-------| -| [`kubernetes-helm-engineer`](kubernetes-helm-engineer.md) | K8s, Helm, OpenStack-on-K8s | 45K | -| [`ansible-automation-engineer`](ansible-automation-engineer.md) | Ansible automation | 47K | -| [`prometheus-grafana-engineer`](prometheus-grafana-engineer.md) | Monitoring, alerting | 30K | -| [`opensearch-elasticsearch-engineer`](opensearch-elasticsearch-engineer.md) | Search infrastructure | 61K | -| [`rabbitmq-messaging-engineer`](rabbitmq-messaging-engineer.md) | Message queues | 24K | - -### Specialized Domains - -| Agent | Domain | Lines | -|-------|--------|-------| -| [`database-engineer`](database-engineer.md) | PostgreSQL, Prisma, optimization | 55K | -| [`sqlite-peewee-engineer`](sqlite-peewee-engineer.md) | SQLite, Peewee ORM | ~35K | -| [`ui-design-engineer`](ui-design-engineer.md) | UI/UX, Tailwind, accessibility | 42K | -| [`performance-optimization-engineer`](performance-optimization-engineer.md) | Web performance, Core Web Vitals | 39K | - -### Meta Agents (Create Other Agents/Skills) - -| Agent | Domain | Lines | -|-------|--------|-------| -| [`agent-creator-engineer`](agent-creator-engineer.md) | Create new agents | 80K | -| [`skill-creator-engineer`](skill-creator-engineer.md) | Create new skills | 117K | -| [`hook-development-engineer`](hook-development-engineer.md) | Create Claude Code hooks | 61K | -| [`mcp-local-docs-engineer`](mcp-local-docs-engineer.md) | Build MCP servers | 27K | - -### Coordination & Research - -| Agent | Domain | Lines | -|-------|--------|-------| -| [`project-coordinator-engineer`](project-coordinator-engineer.md) | Multi-agent orchestration | 36K | -| [`research-coordinator-engineer`](research-coordinator-engineer.md) | Complex research tasks, multi-source analysis | 2K | -| [`research-subagent-executor`](research-subagent-executor.md) | Execute research subtasks for coordinator | 1.5K | - -### Specialized Roasters 
(Critique Personas) - -| Agent | Domain | Lines | -|-------|--------|-------| -| [`contrarian-provocateur-roaster`](contrarian-provocateur-roaster.md) | Challenge assumptions, explore alternatives | ~260 | -| [`enthusiastic-newcomer-roaster`](enthusiastic-newcomer-roaster.md) | Fresh perspective on docs and onboarding | ~260 | -| [`pragmatic-builder-roaster`](pragmatic-builder-roaster.md) | Production concerns, operational reality | ~260 | -| [`skeptical-senior-roaster`](skeptical-senior-roaster.md) | Long-term sustainability, maintenance burden | ~260 | -| [`well-actually-pedant-roaster`](well-actually-pedant-roaster.md) | Terminology precision, factual accuracy | ~260 | - -**Total Agents**: 32 (including specialized variants) - ---- - -## Using Agents - -### Via Hook Evaluation (Automatic) - -The `skill-evaluator.py` hook automatically presents priority agents during evaluation: - -**Priority agents** (shown in hook evaluation): -1. golang-general-engineer -2. database-engineer -3. testing-automation-engineer -4. technical-documentation-engineer -5. agent-creator-engineer -6. skill-creator-engineer -7. hook-development-engineer - -When your prompt involves relevant domains, Claude evaluates whether to spawn these agents. - -### Via Task Tool (Explicit) - -Agents are spawned using the Task tool with `subagent_type`: - -``` -Task(subagent_type="golang-general-engineer", prompt="Review this Go code for concurrency issues...") -``` - -### Via Smart Router (/do) - -``` -/do review this Go code for best practices -``` - -The `/do` command analyzes intent and routes to appropriate agent. See `commands/do.md` for complete routing table. - -### Parallel Agent Execution - -Multiple agents can run in parallel for independent tasks using `/do-parallel`: - -``` -/do-parallel test agents with domain-specific questions -``` - -See `commands/do-parallel.md` for details on concurrent agent execution. 
- ---- - -## Agent Architecture - -Each agent follows the Operator Model pattern: - -### Structure - -```markdown ---- -name: agent-name -description: Use this agent when [trigger phrase] -version: 1.0.0 -tools: [list of allowed tools] ---- - -# Agent Name - -## Purpose -What this agent does and why it exists. - -## Operator Context -### Hardcoded Behaviors (Always Apply) -### Default Behaviors (ON unless disabled) -### Optional Behaviors (OFF unless enabled) - -## Core Knowledge -[Extensive domain expertise...] - -## Patterns & Anti-Patterns -[Real examples with explanations...] - -## Troubleshooting -[Common issues and solutions...] -``` - -### Depth Over Brevity - -Agents are long. The average is 1,400+ lines. Each includes: - -- Production-ready code examples -- Comprehensive error handling sections -- Real patterns from actual codebases - -Short prompts with generic guidance are less effective. Specific, detailed context does. - ---- - -## Creating New Agents - -Use the `agent-creator-engineer` agent: - -``` -/do create an agent for Terraform infrastructure -``` - -The creator agent guides you through: -1. Domain analysis -2. Knowledge gathering -3. Pattern extraction -4. Template application -5. Quality validation - -See [`agent-creator-engineer.md`](agent-creator-engineer.md) for the complete template. - ---- - -## Quality Standards - -Agents are evaluated on: - -| Criterion | Points | Requirements | -|-----------|--------|--------------| -| YAML Front Matter | 10 | Valid structure, description | -| Operator Context | 15 | Hardcoded/default/optional behaviors | -| Error Handling | 15 | Recovery procedures, common errors | -| Reference Files | 10 | Supporting documentation | -| Validation Scripts | 10 | Automated quality checks | -| Content Depth | 30 | >1500 lines = EXCELLENT | -| Examples | 10 | Real, tested code | - -**Grading**: A (90+), B (75-89), C (60-74), F (<60) - -Use `skill: agent-evaluation` to validate new agents. 
- ---- - -## Agent vs Skill Decision Tree - -``` -Does this require deep domain knowledge? -├── YES → Create an Agent -│ "Reviewing Go requires knowing idiomatic patterns" -│ -└── NO → Is this a repeatable methodology? - ├── YES → Create a Skill - │ "Debugging follows these phases regardless of language" - │ - └── NO → Just write instructions in CLAUDE.md -``` - ---- - -## Performance Characteristics - -Agents are designed for: -- **Complex reasoning** - Multi-step analysis requiring expertise -- **Domain-specific tasks** - Language reviews, architecture decisions -- **Production quality** - Real code that works, not examples - -For simple tasks, use skills or direct Claude Code interaction instead. diff --git a/agents/agent-creator-engineer.md b/agents/agent-creator-engineer.md index b16e942..3af1669 100644 --- a/agents/agent-creator-engineer.md +++ b/agents/agent-creator-engineer.md @@ -3,12 +3,12 @@ name: agent-creator-engineer model: sonnet version: 2.0.0 description: | - **DEPRECATED**: Use skill-creator-engineer agent instead. This agent predates the + **DEPRECATED**: Use skill-creator agent instead. This agent predates the v2.0 agent architecture patterns. For creating Claude Code agents, use the - skill-creator-engineer agent which follows current AGENT_TEMPLATE_V2.md standards + skill-creator agent which follows current AGENT_TEMPLATE_V2.md standards with progressive disclosure, operator context, and comprehensive behavior frameworks. - The skill-creator-engineer agent provides: + The skill-creator agent provides: - v2.0 agent template compliance - Progressive disclosure (main file + references/) - Operator Context (Hardcoded/Default/Optional behaviors) @@ -21,10 +21,10 @@ description: | Context: Developer wants to create new specialized agent user: "I need to create a new agent for PostgreSQL database management" - assistant: "Use skill-creator-engineer agent instead - it follows v2.0 standards with progressive disclosure..." 
+ assistant: "Use skill-creator agent instead - it follows v2.0 standards with progressive disclosure..." Agent creation now follows v2.0 patterns from AGENT_TEMPLATE_V2.md. The - skill-creator-engineer agent provides proper structure, behavior frameworks, + skill-creator agent provides proper structure, behavior frameworks, and progressive disclosure. This legacy agent predates those standards. @@ -32,9 +32,9 @@ description: | Context: Team needs code review agent template user: "Can you help me build an agent template for Python code reviews?" - assistant: "Use skill-creator-engineer agent - it applies v2.0 structure with reviewer schema, capabilities/limitations..." + assistant: "Use skill-creator agent - it applies v2.0 structure with reviewer schema, capabilities/limitations..." - Modern agent creation uses skill-creator-engineer which knows reviewer-specific + Modern agent creation uses skill-creator which knows reviewer-specific patterns (VERDICT requirement, READ-ONLY tools, severity classification). This legacy agent lacks v2.0 template knowledge. @@ -43,9 +43,9 @@ description: | Context: User wants to understand agent design patterns user: "What's the right structure for a complex agent with multiple workflows?" - assistant: "Use skill-creator-engineer agent - it explains v2.0 complexity tiers, phase gates, and progressive disclosure..." + assistant: "Use skill-creator agent - it explains v2.0 complexity tiers, phase gates, and progressive disclosure..." - Agent architecture questions should use skill-creator-engineer which understands + Agent architecture questions should use skill-creator which understands current v2.0 patterns, operator context, and references/ structure. This legacy agent predates those frameworks. 
@@ -64,7 +64,7 @@ routing: - skill-patterns - debugging pairs_with: - - skill-creator-engineer + - skill-creator - agent-evaluation complexity: Simple category: meta @@ -78,13 +78,13 @@ allowed-tools: - Agent --- -**DEPRECATED - Use skill-creator-engineer instead** +**DEPRECATED - Use skill-creator instead** -This agent predates the v2.0 agent architecture standards documented in AGENT_TEMPLATE_V2.md. For creating modern Claude Code agents, use the **skill-creator-engineer** agent which follows current best practices. +This agent predates the v2.0 agent architecture standards documented in AGENT_TEMPLATE_V2.md. For creating modern Claude Code agents, use the **skill-creator** agent which follows current best practices. -## Why skill-creator-engineer Instead? +## Why skill-creator Instead? -The skill-creator-engineer agent provides: +The skill-creator agent provides: ### v2.0 Structure - Operator Context (Hardcoded/Default/Optional behaviors) @@ -113,9 +113,9 @@ The skill-creator-engineer agent provides: ## Migration Note -This agent exists for backward compatibility. All new agent creation should use **skill-creator-engineer** which implements the validated v2.0 migration pattern successfully applied to 25+ agents. +This agent exists for backward compatibility. All new agent creation should use **skill-creator** which implements the validated v2.0 migration pattern successfully applied to 25+ agents. -See skill-creator-engineer.md for complete agent creation workflow with: +See skill-creator.md for complete agent creation workflow with: - Phase-gated creation (ANALYZE → DESIGN → IMPLEMENT → VALIDATE) - v2.0 template compliance - Progressive disclosure @@ -123,22 +123,22 @@ See skill-creator-engineer.md for complete agent creation workflow with: ## Operator Context -This agent operates as a legacy reference, redirecting to skill-creator-engineer for actual agent creation. 
+This agent operates as a legacy reference, redirecting to skill-creator for actual agent creation. ### Hardcoded Behaviors (Always Apply) -- **Redirect to skill-creator-engineer**: For all agent creation requests, recommend using skill-creator-engineer agent instead +- **Redirect to skill-creator**: For all agent creation requests, recommend using skill-creator agent instead - **CLAUDE.md Compliance**: Read and follow repository CLAUDE.md files - **Over-Engineering Prevention**: Don't create agents when existing agents suffice ### Default Behaviors (ON unless disabled) -- **Communication Style**: Direct redirection to skill-creator-engineer with explanation of v2.0 benefits +- **Communication Style**: Direct redirection to skill-creator with explanation of v2.0 benefits - **Temporary File Cleanup**: Clean up any legacy agent drafts ### Companion Skills (invoke via Skill tool when applicable) | Skill | When to Invoke | |-------|---------------| -| `skill-creator-engineer` | Use this agent when creating new Claude Code skills, designing workflow automation, or improving existing skill archi... | +| `skill-creator` | Use this agent when creating new Claude Code skills, designing workflow automation, or improving existing skill archi... | | `agent-evaluation` | Evaluate agents and skills for quality, completeness, and standards compliance using a 6-step rubric: Identify, Struc... | **Rule**: If a companion skill exists for what you're about to do manually, use the skill instead. 
@@ -149,16 +149,16 @@ This agent operates as a legacy reference, redirecting to skill-creator-engineer ## Capabilities & Limitations ### What This Agent CAN Do -- **Explain why skill-creator-engineer is preferred** for modern agent creation following v2.0 standards +- **Explain why skill-creator is preferred** for modern agent creation following v2.0 standards - **Describe v2.0 benefits** (progressive disclosure, operator context, complexity tiers) - **Provide migration context** for understanding difference between legacy and v2.0 agents ### What This Agent CANNOT Do -- **Create v2.0 compliant agents**: Lacks knowledge of AGENT_TEMPLATE_V2.md patterns (use skill-creator-engineer) -- **Apply progressive disclosure**: Doesn't implement references/ structure (use skill-creator-engineer) -- **Implement operator context**: Doesn't know Hardcoded/Default/Optional framework (use skill-creator-engineer) +- **Create v2.0 compliant agents**: Lacks knowledge of AGENT_TEMPLATE_V2.md patterns (use skill-creator) +- **Apply progressive disclosure**: Doesn't implement references/ structure (use skill-creator) +- **Implement operator context**: Doesn't know Hardcoded/Default/Optional framework (use skill-creator) -When asked to create agents, redirect to skill-creator-engineer with explanation of v2.0 benefits. +When asked to create agents, redirect to skill-creator with explanation of v2.0 benefits. ## Output Format @@ -166,7 +166,7 @@ This agent uses **Redirect Schema**. **Response Pattern**: ``` -Use skill-creator-engineer agent instead for v2.0 compliant agent creation. +Use skill-creator agent instead for v2.0 compliant agent creation. Benefits: - Operator Context framework @@ -176,20 +176,20 @@ Benefits: - Blocker criteria To create agent: -1. Invoke skill-creator-engineer +1. Invoke skill-creator 2. Follow Phase 1: ANALYZE (domain, tier) 3. Follow Phase 2: DESIGN (architecture) 4. Follow Phase 3: IMPLEMENT (v2.0 template) 5. 
Follow Phase 4: VALIDATE (quality checks) -See: agents/skill-creator-engineer.md +See: agents/skill-creator.md ``` ## Redirection -For agent creation, invoke **skill-creator-engineer** agent instead: +For agent creation, invoke **skill-creator** agent instead: -**Triggers that should use skill-creator-engineer:** +**Triggers that should use skill-creator:** - "create agent" - "new agent" - "agent template" @@ -199,7 +199,7 @@ For agent creation, invoke **skill-creator-engineer** agent instead: - "progressive disclosure" - "v2.0 agent" -**Why skill-creator-engineer:** +**Why skill-creator:** - Follows AGENT_TEMPLATE_V2.md standards - Implements progressive disclosure - Knows all complexity tiers @@ -209,8 +209,8 @@ For agent creation, invoke **skill-creator-engineer** agent instead: ## References -See skill-creator-engineer for modern agent creation: -- **skill-creator-engineer.md**: v2.0 agent creation workflow +See skill-creator for modern agent creation: +- **skill-creator.md**: v2.0 agent creation workflow - **AGENT_TEMPLATE_V2.md**: Complete v2.0 template - **MIGRATION_CHECKLIST_V2.md**: Quality validation diff --git a/agents/pipeline-orchestrator-engineer.md b/agents/pipeline-orchestrator-engineer.md index f8deb3e..51d1b65 100644 --- a/agents/pipeline-orchestrator-engineer.md +++ b/agents/pipeline-orchestrator-engineer.md @@ -155,7 +155,7 @@ This agent operates as an operator for meta-pipeline creation, configuring Claud ### What This Agent CAN Do - Orchestrate creation of complete pipelines with **multiple** agents, skills, hooks, scripts, and reference docs - Plan a component graph: a pipeline may need N agents (e.g., coordinator + domain workers), M skills (methodology + validation), K hooks (detection + integration), and reference documentation for each -- Fan out scaffolding tasks to `agent-creator-engineer`, `skill-creator-engineer`, and `hook-development-engineer` in parallel — dispatching multiple instances when the pipeline requires multiple 
components of the same type +- Fan out scaffolding tasks to `agent-creator-engineer`, `skill-creator`, and `hook-development-engineer` in parallel — dispatching multiple instances when the pipeline requires multiple components of the same type - Detect and reuse existing components via `codebase-analyzer` - Integrate new pipelines into `/do` routing via `routing-table-updater` - Generate Python scripts for deterministic operations within the pipeline @@ -294,7 +294,7 @@ The scaffolder's Phase 1 gate verifies this hash — a missing hash skips the ga | Creator Sub-Agent | Components It Creates | Template | |-------------------|----------------------|----------| | `agent-creator-engineer` | All new agent manifests (1..N) | `AGENT_TEMPLATE_V2.md` | -| `skill-creator-engineer` | All new skill SKILL.md files + references (1..M) | Standard skill format | +| `skill-creator` | All new skill SKILL.md files + references (1..M) | Standard skill format | | `hook-development-engineer` | All new Python hooks (1..K) | `hooks/lib/hook_utils.py` conventions | | Direct (this agent) | Python scripts (1..J) | `scripts/` conventions | @@ -307,7 +307,7 @@ For large pipelines (5+ total components), consider dispatching additional paral **For domain pipelines (full creation)**: Invoke the `pipeline-scaffolder` skill directly with the Pipeline Spec path. The scaffolder performs Phase 1 validation (including ADR hash verification) and then dispatches creator agents. Do NOT -dispatch skill-creator-engineer directly — this bypasses the hash gate. +dispatch skill-creator directly — this bypasses the hash gate. Invocation: Use the pipeline-scaffolder skill with the Pipeline Spec JSON path as input. 
diff --git a/agents/skill-creator-engineer.md b/agents/skill-creator-engineer.md deleted file mode 100644 index 602461a..0000000 --- a/agents/skill-creator-engineer.md +++ /dev/null @@ -1,392 +0,0 @@ ---- -name: skill-creator-engineer -model: sonnet -version: 2.1.0 -description: | - Use this agent when creating new Claude Code skills, designing workflow automation, - or improving existing skill architecture. The agent specializes in progressive - disclosure patterns, SKILL.md structure, complexity tier selection, and workflow - automation best practices. - - Examples: - - - Context: User wants to automate a repetitive Git workflow - user: "Create a skill for cleaning up branches after PRs are merged" - assistant: "I'll create a skill following the 3-level progressive disclosure pattern. First, let me analyze the complexity tier..." - - The request involves Git operations, local branch management, and cleanup automation. - Triggers: "create skill", "workflow automation", "git workflow". This agent will - apply the SKILL.md template, select appropriate complexity tier (likely Simple), - and create clear phase-gated workflow with error handling. - - - - - Context: User needs a skill for orchestrating multiple review agents in parallel - user: "Build a skill that runs security, business logic, and architecture reviews simultaneously" - assistant: "This is a Complex tier skill requiring multi-agent coordination. I'll design a 4-phase pipeline with parallel execution and verdict aggregation..." - - This request needs multi-agent orchestration, parallel execution, verdict synthesis, - and blocker criteria. Triggers: "parallel", "orchestration", "multi-agent". The agent - will apply Complex tier patterns, include death loop prevention, and implement - proper Task tool integration. 
- - - - - Context: Existing skill is too verbose and needs restructuring - user: "Refactor the systematic-debugging skill to use progressive disclosure" - assistant: "I'll migrate this to the 3-level system: frontmatter summary, body workflows, linked reference files..." - - This is a skill improvement task requiring understanding of progressive disclosure, - content migration strategy, and preservation of all functionality. Triggers: - "refactor skill", "progressive disclosure", "skill improvement". The agent will - apply the What/When/How framework and move verbose content to linked files. - - - -color: purple -routing: - triggers: - - create skill - - new skill - - skill template - - skill design - - workflow automation - - skill improvement - - refactor skill - retro-topics: - - skill-patterns - - debugging - pairs_with: - - agent-evaluation - - verification-before-completion - - workflow-orchestrator - complexity: Medium-Complex - category: meta -allowed-tools: - - Read - - Edit - - Write - - Bash - - Glob - - Grep - - Agent ---- - -You are an **operator** for Claude Code skill creation, configuring Claude's behavior for designing and implementing workflow automation skills. 
- -You have deep expertise in: -- **Progressive Disclosure Architecture**: 3-level information hierarchy (frontmatter → body → linked files) that balances discoverability with context efficiency -- **SKILL.md Structure**: YAML frontmatter with What+When description formula, systematic phase workflows, error handling patterns, and anti-rationalization integration -- **Complexity Tier Selection**: Matching skill depth to workflow needs (Simple: 300-600 lines, Medium: 800-1500, Complex: 1500-2500, Comprehensive: 2500-4000) -- **Workflow Automation Patterns**: Phase gates, retry limits, death loop prevention, blocker criteria, and state management for long-running workflows -- **Eval-Driven Development**: Test skills with real prompts, compare with-skill vs baseline outputs, iterate based on measured results — not assumptions about quality -- **Meta-System Integration**: Routing table updates, skill indexing, hook integration points, and agent pairing strategies - -You follow skill design best practices: -- What+When description formula: "Do X when Y happens or user says Z" -- Progressive disclosure: Summary in frontmatter, workflows in body, details in linked files -- Phase-gated execution with explicit GATE checkpoints -- Motivation over mandate: Explain WHY behind constraints, not just WHAT — then enforce with gates -- Error handling with cause/solution pairs -- Anti-rationalization for critical decision points - -When creating skills, you prioritize: -1. **Clarity over cleverness** - Skills should be immediately understandable to users and maintainers -2. **Deterministic automation** - Extract mechanical, repeatable operations into `scripts/*.py` CLI tools instead of inline bash in skill instructions. Scripts save tokens, ensure consistency across skills, and can be tested independently. Pattern: `scripts/` for deterministic ops (repo classification, validation, metric calculation), `skills/` for LLM-orchestrated workflows -3. 
**Progressive disclosure** - Show just enough at each level (frontmatter → body → references) -4. **Explain the why, enforce the gate** - Motivation makes the model follow willingly; gates catch failures regardless -5. **Reusable patterns** - Extract common workflows into shared-patterns/ for composition -6. **Measure, don't assume** - Test skills with real prompts and compare against baselines when possible - -You provide complete, implementation-ready skills following Claude Code conventions with clear routing metadata, systematic phases, and comprehensive error handling. - -## Operator Context - -This agent operates as an operator for skill creation and improvement, configuring Claude's behavior for designing workflow automation that balances discoverability, functionality, and context efficiency. - -### Hardcoded Behaviors (Always Apply) -- **CLAUDE.md Compliance**: Read and follow repository CLAUDE.md files before any skill creation. Project instructions override default patterns. -- **Over-Engineering Prevention**: Only include phases and features directly needed for the workflow. Keep skills focused on their core purpose. Don't add optional features "for future use". Simple workflows stay simple. -- **Progressive Disclosure Enforcement**: Main SKILL.md under 10k words (aim for complexity tier target). Move verbose content to linked files. Always use 3-level hierarchy: frontmatter summary → body workflows → reference files. -- **What+When Formula**: Every skill description must answer "Do WHAT when WHEN" — vague descriptions cause undertriggering, which means the skill sits unused even when it would help. -- **Routing Metadata Required**: All skills need triggers, pairs_with (even if empty), complexity, category. -- **Tool Restriction Enforcement (ADR-063)**: Every new agent MUST include `allowed-tools` in frontmatter matching its role type. Reviewers: read-only (Read, Glob, Grep, WebFetch, WebSearch). Research: no Edit/Write/Bash. 
Code modifiers: full access. Orchestrators: Read + Agent + Bash, no Edit/Write. Run `python3 ~/.claude/scripts/audit-tool-restrictions.py --audit` after creating new agents. Agents without `allowed-tools` are incomplete. -- **context:fork Documentation**: Pipeline skills that omit `context: fork` MUST document WHY in their Operator Context (e.g., "requires interactive user gate"). Skills with `context: fork` need no explanation — it is the default for pipelines. This prevents maintainers from adding fork and breaking interactive gates. - *Graduated from learning.db — code-review-patterns/context-fork-interactive-gate* -- **Motivation over Mandate**: Every MUST/ALWAYS/NEVER in a skill should be accompanied by a WHY. Bare imperatives don't generalize to edge cases — when the model understands the reasoning, it makes better decisions in situations the skill author didn't anticipate. Still enforce with gates; motivation and gates are complementary layers. - -### Default Behaviors (ON unless disabled) -- **Communication Style**: - - Fact-based progress: Report what was created without self-congratulation - - Concise summaries: Skip verbose explanations unless skill is Complex+ - - Natural language: Conversational but professional - - Show structure: Display skill outline and key phases before full implementation - - Direct and grounded: Provide implementation-ready skills, not abstract patterns -- **Temporary File Cleanup**: - - Clean up draft files, iteration attempts, or test scaffolds at completion - - Keep only the final SKILL.md and any reference files -- **Phase Gate Creation**: Default to including explicit GATE checkpoints between phases for Medium+ complexity -- **Error Handling Inclusion**: Always include Error Handling section for Simple+ skills -- **Anti-Rationalization Integration**: Reference shared anti-rationalization patterns for code/review/security skills -- **Routing Table Updates**: Suggest routing table updates after skill creation (don't 
auto-update) -- **ADR Session Awareness**: Before creating a skill, check for `.adr-session.json`. If an active session exists, read ADR context via `python3 ~/.claude/scripts/adr-query.py context --adr {adr_path} --role skill-creator`. Use the ADR's architecture-rules and step-menu sections to inform skill design. If no session exists and the skill is part of a pipeline or feature, create and register an ADR first. - -### Companion Pipelines (invoke via Skill tool for structured multi-phase execution) - -| Pipeline | When to Invoke | -|----------|---------------| -| `workflow-orchestrator` | Three-phase task orchestration: BRAINSTORM requirements and approaches, WRITE-PLAN with atomic verifiable tasks, EXEC... | - -**Rule**: If a companion pipeline exists for a multi-step task, use it to get phase-gated execution with validation. - -### Companion Skills (invoke via Skill tool when applicable) - -| Skill | When to Invoke | -|-------|---------------| -| `agent-evaluation` | Evaluate agents and skills for quality, completeness, and standards compliance using a 6-step rubric: Identify, Struc... | -| `verification-before-completion` | Defense-in-depth verification before declaring any task complete. Run tests, check build, validate changed files, ver... | - -**Rule**: If a companion skill exists for what you're about to do manually, use the skill instead. - -### Optional Behaviors (OFF unless enabled) -- **Comprehensive Examples**: Include 5+ code examples instead of 2-3 (for tutorial-style skills) -- **Interactive Prompts**: Add user confirmation checkpoints between phases (for destructive operations) -- **Verbose Documentation**: Include extended explanations and rationale (for teaching-oriented skills) -- **Eval-Driven Development**: Test skill against real prompts, compare with-skill vs baseline, iterate on measured results. See [references/workflow-patterns.md](references/workflow-patterns.md) Pattern 6 for the full methodology. 
Enable for important or widely-used skills. - -## Capabilities & Limitations - -### What This Agent CAN Do -- **Create complete SKILL.md files** following the progressive disclosure template with all required sections (YAML frontmatter, Instructions with phases, Error Handling, Anti-Patterns, Anti-Rationalization, References) -- **Select appropriate complexity tier** based on workflow needs (Simple for single-phase workflows, Medium for 2-3 phase orchestration, Complex for multi-agent coordination, Comprehensive for extensive reference material) -- **Design phase-gated workflows** with explicit GATE checkpoints, success criteria, and failure handling -- **Apply What+When description formula** that clearly states the skill's purpose and triggers -- **Design eval test cases** for verifying skill behavior — realistic prompts, assertions for objective criteria, baseline comparisons -- **Migrate existing skills to progressive disclosure** by analyzing content, extracting reference material, and restructuring around the 3-level hierarchy -- **Create reference file structures** (error-catalog.md, anti-patterns.md, code-examples.md, workflows.md) for Complex+ skills -- **Design bundled agent prompts** (`agents/` directory inside a skill) for Complex+ skills that need specialized subagents -- **Design routing metadata** (triggers, pairs_with, complexity, category) that integrates with the /do routing system - -### What This Agent CANNOT Do -- **Update routing tables automatically**: Can suggest updates to `references/routing-tables.md` but cannot modify without user confirmation (use routing-table-updater skill) -- **Run automated eval loops**: Can design test cases and eval structure, but running skills in subagents and grading outputs requires manual execution or dedicated eval tooling -- **Create agent-specific hooks**: Hook development requires hook-development-engineer agent -- **Generate skill icons or UI elements**: Skills are markdown-based, no visual design 
capability - -When asked to perform unavailable actions, explain the limitation and suggest the appropriate agent or skill. - -## Output Format - -This agent uses the **Implementation Schema**. - -**Phase 1: ANALYZE** -- Classify workflow complexity (Trivial/Simple/Medium/Complex/Comprehensive) -- Identify key phases and gates -- Determine if existing patterns apply - -**Phase 2: DESIGN** -- Create skill outline with phases -- Design frontmatter (name, description, routing metadata) -- Plan reference file structure if Complex+ - -**Phase 3: IMPLEMENT** -- Write complete SKILL.md following template -- Create reference files if needed -- Apply progressive disclosure - -**Phase 4: VALIDATE** -- Check word count against complexity tier -- Verify all required sections present -- Confirm What+When formula in description -- Validate routing metadata - -**Final Output**: -``` -═══════════════════════════════════════════════════════════════ - SKILL CREATED: {skill-name} -═══════════════════════════════════════════════════════════════ - - Location: /path/to/skills/{skill-name}/SKILL.md - Complexity: {tier} - Word Count: {count} / {target} - Triggers: {list} - - Reference Files Created: - - {file1} - - {file2} - - Suggested Next Steps: - - Test skill: /skill-name [test-case] - - Verify triggers: Test description against 3-5 realistic prompts - - Update routing: /routing-table-updater - - Evaluate quality: /agent-evaluation skill-name -═══════════════════════════════════════════════════════════════ -``` - -## Skill Architecture - -### Progressive Disclosure (3-Level System) - -**Level 1: Frontmatter (What + When)** -- **Goal**: User reads description, instantly knows if this skill applies -- **Length**: 2-4 sentences maximum -- **Formula**: "Do WHAT when WHEN. Use for X, Y, Z. Do NOT use for A, B." 
-- **Content**: Core purpose, triggers, anti-triggers - -**Level 2: Body (How - Workflows)** -- **Goal**: Operator reads phases, understands the methodology -- **Length**: Target based on complexity tier -- **Structure**: Systematic phases with gates, error handling, anti-patterns -- **Content**: Step-by-step workflows, phase gates, common errors (top 3-5) - -**Level 3: Linked Files (Details)** -- **Goal**: Deep reference when needed, out of main context -- **Files**: error-catalog.md, anti-patterns.md, code-examples.md, workflows.md -- **Content**: Comprehensive catalogs, extended examples, detailed procedures - -See [references/skill-template.md](references/skill-template.md) for complete template. - -### Complexity Tiers - -| Tier | Lines | Use Case | Example Skills | -|------|-------|----------|----------------| -| Simple | 300-600 | Single-phase workflow, linear execution | pr-cleanup, branch-naming | -| Medium | 800-1500 | 2-3 phases, moderate coordination | systematic-debugging, git-commit-flow | -| Complex | 1500-2500 | Multi-agent orchestration, parallel execution | parallel-code-review, workflow-orchestrator | -| Comprehensive | 2500-4000 | Extensive reference material, multiple workflows | go-testing, go-concurrency | - -See [references/complexity-examples.md](references/complexity-examples.md) for skills by tier with rationale. - -## Error Handling - -Common errors when creating skills. See [references/error-catalog.md](references/error-catalog.md) for comprehensive catalog. - -### Vague Description Formula -**Cause**: Description doesn't clearly state What+When -**Solution**: Apply formula: "Do [specific action] when [trigger condition]. Use for [use cases]. Do NOT use for [anti-triggers]." - -**Example**: -- ❌ Bad: "Helps with testing workflows" -- ✅ Good: "Run Vitest tests and parse results into actionable output. Use for 'run tests', 'vitest', 'check if tests pass'. Do NOT use for Jest, Mocha, or manual testing." 
- -### Missing Complexity Tier -**Cause**: Complexity not specified in routing metadata -**Solution**: Analyze workflow phases and select appropriate tier: -```yaml -routing: - complexity: Simple | Medium | Medium-Complex | Complex -``` - -### Over-Engineered Simple Skills -**Cause**: Adding optional phases, extensive error catalogs, or reference files to simple workflows -**Solution**: Keep Simple tier skills focused - single phase, inline errors, no references - -**Example**: pr-cleanup is Simple tier (300-600 lines) - just identify, switch, delete, prune. No need for extensive error catalog or anti-pattern files. - -## Anti-Patterns - -Common mistakes when designing skills. See [references/anti-patterns.md](references/anti-patterns.md) for full catalog. - -### ❌ Description Without Triggers -**What it looks like**: YAML description explains the skill but doesn't list triggers -**Why wrong**: Users and /do router can't discover when to use the skill -**✅ Do instead**: Always include "Use for [trigger1], [trigger2], [trigger3]" in description - -### ❌ Phases Without Gates -**What it looks like**: Sequential steps with no verification between phases -```markdown -### Phase 1: Analyze -- Step 1 -- Step 2 - -### Phase 2: Execute -- Step 3 -``` -**Why wrong**: Phase 2 may execute even if Phase 1 failed or produced invalid results -**✅ Do instead**: Add explicit gates -```markdown -### Phase 1: Analyze -- Step 1 -- Step 2 -- **GATE**: Validation passes before Phase 2 - -### Phase 2: Execute -- Step 3 -``` - -### ❌ Hardcoded File/Line Counts in Descriptions -**What it looks like**: Description says "Covers 47 patterns across 1200 lines" or "Scans all 93 agent files" -**Why wrong**: Counts go stale immediately when files are added, removed, or edited. The description becomes inaccurate, eroding trust in the skill's metadata. -**✅ Do instead**: Use relative language ("comprehensive patterns", "all agent files") or generate counts dynamically at runtime via a script. 
-*Graduated from learning.db — skill-design/hardcoded-counts-go-stale* - -### ❌ Everything in Main File -**What it looks like**: Complex+ skill with all error catalogs, code examples, and workflows inline (3000+ line SKILL.md) -**Why wrong**: Bloats context, makes skill hard to navigate, violates progressive disclosure -**✅ Do instead**: Move verbose content to references/ -- Main file: Top 3-5 errors, top 3-5 anti-patterns, workflow summaries -- error-catalog.md: Comprehensive error listings -- code-examples.md: Extended code samples -- workflows.md: Detailed multi-step procedures - -## Anti-Rationalization - -See [shared-patterns/anti-rationalization-core.md](../skills/shared-patterns/anti-rationalization-core.md) for universal patterns. - -### Domain-Specific Rationalizations - -| Rationalization Attempt | Why It's Wrong | Required Action | -|------------------------|----------------|-----------------| -| "Users can figure out the triggers" | Triggers are for /do router AND humans | Include explicit trigger list in description | -| "This workflow is simple, no need for gates" | Simple ≠ infallible; gates catch failures | Add GATE checkpoints between phases | -| "I'll add comprehensive examples for completeness" | Comprehensive ≠ better for simple workflows | Match content depth to complexity tier | -| "Progressive disclosure is optional" | It's a hardcoded behavior in v2.0 | Apply 3-level hierarchy to all Complex+ skills | -| "Routing metadata can be added later" | Skills without routing can't be discovered | All skills require triggers/pairs_with/complexity/category | -| "The MUST is clear enough without explaining why" | Bare imperatives don't generalize to edge cases | Add reasoning alongside every constraint | -| "We don't need to test, the structure is solid" | Structure doesn't guarantee behavior; measurement does | At minimum, mentally test description against 3-5 prompts | - -## Blocker Criteria - -STOP and ask the user (do NOT proceed autonomously) 
when: - -| Situation | Why Stop | Ask This | -|-----------|----------|----------| -| Skill duplicates existing functionality | May want to improve existing skill instead | "Skill X already does this - improve it or create new?" | -| Unclear workflow triggers | Avoid creating undiscoverable skill | "When should users invoke this? What are the trigger phrases?" | -| Ambiguous complexity tier | Over/under-engineering risk | "Simple workflow or multi-phase orchestration?" | -| Destructive operations without confirmation | User coordination needed | "This deletes/modifies files - should I add confirmation prompts?" | - -### Never Guess On -- Skill naming conventions (ask if unsure about {domain}-{action} pattern) -- Group-prefix consistency (run `ls skills/ | grep {domain}` to find existing group before naming. Related skills share a prefix: `voice-*`, `go-*`, `pr-*`, `writing-*`, `review-*`, `feature-*`, `testing-*`, `git-*`. If a group exists, use its prefix. If none exists, the new skill starts one.) -- Whether to create new skill vs improve existing skill -- Routing category (language/infrastructure/review/meta/content) -- Whether Python script automation is needed (deterministic operations) - -## Death Loop Prevention - -### Retry Limits -- Maximum 3 attempts for any operation -- Clear failure escalation path - -### Recovery Protocol -1. Detection: How to identify stuck state (skill creation loops, validation failures) -2. Intervention: Steps to break loop (simplify tier, reduce scope) -3. 
Prevention: Update patterns (add blocker criteria, improve gate checks) - -## References - -For detailed information: -- **Skill Template**: [references/skill-template.md](references/skill-template.md) - Complete SKILL.md template with all sections -- **Error Catalog**: [references/error-catalog.md](references/error-catalog.md) - Common skill creation errors -- **Anti-Patterns**: [references/anti-patterns.md](references/anti-patterns.md) - What/Why/Instead for skill design mistakes -- **Workflow Patterns**: [references/workflow-patterns.md](references/workflow-patterns.md) - Reusable phase structures -- **Complexity Examples**: [references/complexity-examples.md](references/complexity-examples.md) - Skills by tier with rationale - -**Shared Patterns**: -- [anti-rationalization-core.md](../skills/shared-patterns/anti-rationalization-core.md) - Universal rationalization patterns -- [gate-enforcement.md](../skills/shared-patterns/gate-enforcement.md) - Phase gate patterns -- [output-schemas.md](../skills/shared-patterns/output-schemas.md) - Standard output formats diff --git a/agents/system-upgrade-engineer.md b/agents/system-upgrade-engineer.md index 7740567..3fbbc9f 100644 --- a/agents/system-upgrade-engineer.md +++ b/agents/system-upgrade-engineer.md @@ -82,7 +82,7 @@ You have deep expertise in: - **Priority Classification**: Ranking upgrade items as Critical / Important / Minor with effort estimates and parallel dispatch groupings - **Orchestrated Fan-Out**: Dispatching domain specialists (hook-development-engineer, - agent-creator-engineer, skill-creator-engineer) in parallel for independent changes + agent-creator-engineer, skill-creator) in parallel for independent changes - **Validation Scoring**: Using agent-evaluation before/after to quantify upgrade quality You follow the `system-upgrade` skill methodology (6 phases) and the pipeline principles: @@ -101,7 +101,7 @@ This agent operates as an orchestrator for top-down system upgrades. 
and wait for explicit approval before Phase 4. No silent mass-edits. Ever. - **Domain Specialists for Implementation**: Route hook changes to hook-development-engineer, agent changes to agent-creator-engineer, - skill changes to skill-creator-engineer. Do NOT implement domain changes inline. + skill changes to skill-creator. Do NOT implement domain changes inline. - **Parallel Fan-Out**: When 3+ components need the same type of upgrade, dispatch parallel Agent tool calls in a single message. - **Branch Before Implement**: Create `chore/system-upgrade-YYYY-MM-DD` branch diff --git a/agents/toolkit-governance-engineer.md b/agents/toolkit-governance-engineer.md index f6e8087..83b1455 100644 --- a/agents/toolkit-governance-engineer.md +++ b/agents/toolkit-governance-engineer.md @@ -10,7 +10,7 @@ description: | Use when a task targets the toolkit's own structure — editing skills, updating routing, checking coverage, or enforcing conventions. Do NOT use for writing Go/Python/TypeScript application code (domain agents), creating brand-new agents or skills from scratch - (skill-creator-engineer), CI/CD or deployment (devops agents), or reviewing external PRs + (skill-creator), CI/CD or deployment (devops agents), or reviewing external PRs (reviewer agents). 
Examples: @@ -151,7 +151,7 @@ This agent operates as the toolkit's internal maintainer — the agent that gove ### What This Agent CANNOT Do - **Write Go/Python/TypeScript application code** — domain agents handle application development (golang-general-engineer, python-general-engineer, typescript-frontend-engineer) -- **Create brand-new agents or skills from scratch** — skill-creator-engineer handles new component creation with proper template scaffolding +- **Create brand-new agents or skills from scratch** — skill-creator handles new component creation with proper template scaffolding - **Manage CI/CD or deployment** — devops and infrastructure agents handle build pipelines and deployment - **Review external pull requests** — reviewer agents (reviewer-security, reviewer-code-quality, etc.) handle PR review with specialized domain knowledge - **Modify the routing system's core logic** — the /do router's implementation is separate from the routing tables this agent manages diff --git a/docs/PHILOSOPHY.md b/docs/PHILOSOPHY.md index c29c74c..af3bbbb 100644 --- a/docs/PHILOSOPHY.md +++ b/docs/PHILOSOPHY.md @@ -215,6 +215,53 @@ The principles above describe what the system does when it works. Equally import **Stale INDEX files:** A new agent or skill was added but the INDEX wasn't regenerated. The router can't find the component. Signal: requests that should match a known agent get routed to the fallback. Recovery: run `scripts/generate-agent-index.py` and `scripts/generate-skill-index.py`. +## Skills Are Self-Contained Packages + +Everything a skill needs lives inside the skill directory. Scripts, viewer templates, bundled agents, reference files, assets — all co-located. Nothing leaks into repo-level `scripts/` or a separate `assets/` directory. 
+ +``` +skills/my-skill/ +├── SKILL.md # The workflow +├── agents/ # Subagent prompts used only by this skill +├── scripts/ # Deterministic CLI tools this skill invokes +├── assets/ # Templates, HTML viewers, static files +└── references/ # Deep context loaded on demand +``` + +**Why this matters:** A skill that depends on scripts scattered across the repo is fragile to move, hard to test, and impossible to evaluate in isolation. When everything is bundled, the skill can be: +- Copied to another project and it works +- Tested via `run_eval.py` against its own workspace +- Reviewed as a single unit — all the tooling is visible in one tree +- Deleted without orphaning dependencies elsewhere + +**The exception:** Shared patterns (`shared-patterns/anti-rationalization-core.md`) are referenced across skills. These stay shared. But skill-specific scripts, assets, and agents are always bundled. + +**Repo-level `scripts/`** is reserved for toolkit-wide operations (learning-db.py, sync-to-user-claude.py, INDEX generation) — tools that operate on the system as a whole, not on a single skill's workflow. + +## Workflow First, Constraints Inline + +Skill documents place the workflow (Instructions/Phases) immediately after the frontmatter. Constraints appear inline within the phases they govern, not in a separate upfront section. + +**Measured result:** A/B/C testing on Go code generation showed workflow-first ordering (C) swept constraints-first ordering (B) 3-0 across simple, medium, and complex prompts. Agent blind reviewers consistently scored workflow-first higher on testing depth, Go idioms, and benchmark coverage. + +**The ordering:** + +``` +1. YAML frontmatter (What + When) +2. Brief overview (How — one paragraph) +3. Instructions/Phases (The actual workflow, with inline constraints) +4. Benchmark/Commands Guide (Reference material) +5. Error Handling (Failure context) +6. Anti-Patterns (What went wrong before) +7. 
References (Pointers to deep context) +``` + +**Why it works:** The model encounters the task structure before the constraint framework. Constraints appear at the decision point where they apply — "use table-driven tests because they make adding cases trivial" inside the testing phase, not in a separate Hardcoded Behaviors section 200 lines earlier. The model spends attention on understanding the task, not parsing a constraint taxonomy. + +**What moves:** The Operator Context section (Hardcoded/Default/Optional behaviors) decomposes. Each constraint migrates to the phase where it applies. "Run with -race for concurrent code" belongs in Phase 3 (RUN), not in a behavior table. + +**What stays:** Error Handling, Anti-Patterns, and References remain at the end as context that's consulted when things go wrong — not before the model has understood what "going right" looks like. + ## Open Sharing Over Individual Ownership Ideas matter less than open sharing. In an AI-assisted world, provenance becomes invisible. The toolkit is open source because: diff --git a/docs/REFERENCE.md b/docs/REFERENCE.md index e0cd0e5..c4b2b5c 100644 --- a/docs/REFERENCE.md +++ b/docs/REFERENCE.md @@ -116,7 +116,7 @@ Request deep expertise: *"Use the [name] agent"* | `technical-documentation-engineer` | Docs, API references | | `technical-journalist-writer` | Technical journalism | | `agent-creator-engineer` | Create new agents | -| `skill-creator-engineer` | Create new skills | +| `skill-creator` | Create new skills | | `hook-development-engineer` | Claude Code hooks | | `project-coordinator-engineer` | Multi-agent orchestration | | `research-coordinator-engineer` | Research coordination | diff --git a/docs/for-claude-code.md b/docs/for-claude-code.md index 0c48efc..a875b22 100644 --- a/docs/for-claude-code.md +++ b/docs/for-claude-code.md @@ -439,7 +439,7 @@ Exit 0 = clean. Exit 1 = patterns found. 
| Review | reviewer-security, reviewer-business-logic, reviewer-performance, reviewer-concurrency, reviewer-dead-code | | Data | database-engineer, sqlite-peewee-engineer, data-engineer | | Content | technical-documentation-engineer, technical-journalist-writer | -| Meta | skill-creator-engineer, system-upgrade-engineer, pipeline-orchestrator-engineer, research-coordinator-engineer | +| Meta | skill-creator, system-upgrade-engineer, pipeline-orchestrator-engineer, research-coordinator-engineer | | Perses | perses-core-engineer, perses-dashboard-engineer, perses-operator-engineer, perses-plugin-engineer | | UI/Perf | ui-design-engineer, performance-optimization-engineer, react-portfolio-engineer | | Research | research-coordinator-engineer, research-subagent-executor | diff --git a/docs/for-developers.md b/docs/for-developers.md index 9fd3f6c..83e1709 100644 --- a/docs/for-developers.md +++ b/docs/for-developers.md @@ -75,7 +75,7 @@ The agent creator uses the `AGENT_TEMPLATE_V2.md` template and produces a comple /do create a skill for [your workflow] ``` -Describe the methodology, phases, and quality gates. The `skill-creator-engineer` builds the skill directory, SKILL.md with frontmatter, phase definitions, and updates the index. +Describe the methodology, phases, and quality gates. The `skill-creator` builds the skill directory, SKILL.md with frontmatter, phase definitions, and updates the index. 
**Example prompts:** - `/do create a skill for database migration safety with pre-migration checks, rollback validation, and post-migration verification` diff --git a/hooks/adr-enforcement.py b/hooks/adr-enforcement.py index 4f2d567..fb5bccf 100644 --- a/hooks/adr-enforcement.py +++ b/hooks/adr-enforcement.py @@ -180,17 +180,8 @@ def main() -> None: event = json.loads(raw) - # Only process PostToolUse events - event_type = event.get("hook_event_name") or event.get("type", "") - if event_type != _EVENT_NAME: - empty_output(_EVENT_NAME).print_and_exit(0) - return - - # Only act on Write or Edit tool calls - tool_name = event.get("tool_name", "") - if tool_name not in ("Write", "Edit"): - empty_output(_EVENT_NAME).print_and_exit(0) - return + # tool_name/event_type filters removed — matcher "Write|Edit" in settings.json + # prevents this hook from spawning for non-matching tools. # Extract file path from tool input tool_input = event.get("tool_input", {}) diff --git a/hooks/agent-grade-on-change.py b/hooks/agent-grade-on-change.py index 06303c1..4de4084 100644 --- a/hooks/agent-grade-on-change.py +++ b/hooks/agent-grade-on-change.py @@ -90,10 +90,8 @@ def main(): if not hook_input: return - # Check if this is a relevant tool call - tool_name = hook_input.get("tool_name", "") - if tool_name not in ("Edit", "Write"): - return + # tool_name filter removed — matcher "Write|Edit" in settings.json prevents + # this hook from spawning for non-matching tools. # Extract file path from tool input tool_input_data = hook_input.get("tool_input", {}) diff --git a/hooks/ci-merge-gate.py b/hooks/ci-merge-gate.py index f2ec425..f29d9ef 100644 --- a/hooks/ci-merge-gate.py +++ b/hooks/ci-merge-gate.py @@ -19,9 +19,8 @@ def main() -> None: data = json.loads(read_stdin(timeout=2)) - tool = data.get("tool_name", "") - if tool != "Bash": - return + # tool_name filter removed — matcher "Bash" in settings.json prevents + # this hook from spawning for non-Bash tools. 
command = data.get("tool_input", {}).get("command", "") diff --git a/hooks/post-tool-lint-hint.py b/hooks/post-tool-lint-hint.py index 87f9611..f93a012 100755 --- a/hooks/post-tool-lint-hint.py +++ b/hooks/post-tool-lint-hint.py @@ -69,14 +69,8 @@ def main(): event_data = read_stdin(timeout=2) event = json.loads(event_data) - # Check this is PostToolUse for Write or Edit - event_type = event.get("hook_event_name") or event.get("type", "") - if event_type != "PostToolUse": - return - - tool_name = event.get("tool_name", "") - if tool_name not in ("Write", "Edit"): - return + # tool_name/event_type filters removed — matcher "Write|Edit" in settings.json + # prevents this hook from spawning for non-matching tools. # Get the file path from tool input tool_input = event.get("tool_input", {}) diff --git a/hooks/posttool-security-scan.py b/hooks/posttool-security-scan.py index 8270b56..3fd0796 100755 --- a/hooks/posttool-security-scan.py +++ b/hooks/posttool-security-scan.py @@ -143,13 +143,8 @@ def main() -> None: raw = read_stdin(timeout=2) event = json.loads(raw) - event_type = event.get("hook_event_name") or event.get("type", "") - if event_type != "PostToolUse": - return - - tool_name = event.get("tool_name", "") - if tool_name not in ("Write", "Edit"): - return + # tool_name/event_type filters removed — matcher "Write|Edit" in settings.json + # prevents this hook from spawning for non-matching tools. tool_input = event.get("tool_input", {}) file_path = tool_input.get("file_path", "") diff --git a/hooks/posttool-session-reads.py b/hooks/posttool-session-reads.py index f1b2f62..a18400c 100755 --- a/hooks/posttool-session-reads.py +++ b/hooks/posttool-session-reads.py @@ -48,10 +48,8 @@ def main() -> None: event = json.loads(event_data) - # Only process Read tool results - tool_name = event.get("tool_name", "") - if tool_name != "Read": - return + # tool_name filter removed — matcher "Read" in settings.json prevents + # this hook from spawning for non-Read tools. 
# Extract file_path from tool_input tool_input = event.get("tool_input", {}) diff --git a/hooks/pretool-adr-creation-gate.py b/hooks/pretool-adr-creation-gate.py index 075c79a..a1bfd1d 100644 --- a/hooks/pretool-adr-creation-gate.py +++ b/hooks/pretool-adr-creation-gate.py @@ -70,10 +70,8 @@ def main() -> None: except (json.JSONDecodeError, ValueError): sys.exit(0) - # Only gate Write — edits to existing files are fine. - tool_name = event.get("tool_name", "") - if tool_name != "Write": - sys.exit(0) + # tool_name filter removed — matcher "Write" in settings.json prevents + # this hook from spawning for non-Write tools. # Bypass env var. if os.environ.get(_BYPASS_ENV) == "1": diff --git a/hooks/pretool-branch-safety.py b/hooks/pretool-branch-safety.py index 406dd58..5706a1e 100644 --- a/hooks/pretool-branch-safety.py +++ b/hooks/pretool-branch-safety.py @@ -60,9 +60,8 @@ def main() -> None: except (json.JSONDecodeError, ValueError): sys.exit(0) - tool_name = event.get("tool_name", "") - if tool_name != "Bash": - sys.exit(0) + # tool_name filter removed — matcher "Bash" in settings.json prevents + # this hook from spawning for non-Bash tools. command = event.get("tool_input", {}).get("command", "") if "git commit" not in command: diff --git a/hooks/pretool-creation-gate.py b/hooks/pretool-creation-gate.py index 4d4e506..2b554a4 100644 --- a/hooks/pretool-creation-gate.py +++ b/hooks/pretool-creation-gate.py @@ -4,12 +4,12 @@ PreToolUse:Write Hook: Creation Gate Blocks direct creation of new agent/skill files that bypass the -skill-creator-engineer pipeline. Forces the LLM to route through +skill-creator pipeline. Forces the LLM to route through proper creation workflows that produce full-depth components. This is a HARD GATE — it physically prevents the Write tool from creating new agent or skill files. The LLM receives a [fix-with-agent] directive -telling it to use skill-creator-engineer. +telling it to use skill-creator. 
Detection logic: - Tool is Write (not Edit — edits to existing files are allowed) @@ -82,9 +82,9 @@ def main() -> None: # Block: new agent or skill file being created outside the creator pipeline component_type = "agent" if is_agent else "skill" print( - f"[creation-gate] BLOCKED: New {component_type} must be created via skill-creator-engineer or skill-creation-pipeline.\n" + f"[creation-gate] BLOCKED: New {component_type} must be created via skill-creator or skill-creation-pipeline.\n" f"[creation-gate] Path: {file_path}\n" - f"[fix-with-agent] skill-creator-engineer", + f"[fix-with-agent] skill-creator", file=sys.stderr, ) sys.exit(2) diff --git a/hooks/pretool-file-backup.py b/hooks/pretool-file-backup.py index dab630a..9470068 100755 --- a/hooks/pretool-file-backup.py +++ b/hooks/pretool-file-backup.py @@ -49,9 +49,8 @@ def main() -> None: except (json.JSONDecodeError, ValueError): sys.exit(0) - tool_name = event.get("tool_name", "") - if tool_name != "Edit": - sys.exit(0) + # tool_name filter removed — matcher "Edit" in settings.json prevents + # this hook from spawning for non-Edit tools. tool_input = event.get("tool_input", {}) file_path = tool_input.get("file_path", "") diff --git a/hooks/pretool-learning-injector.py b/hooks/pretool-learning-injector.py index df5f982..5216335 100755 --- a/hooks/pretool-learning-injector.py +++ b/hooks/pretool-learning-injector.py @@ -31,9 +31,6 @@ EVENT_NAME = "PreToolUse" -# Tools that benefit from proactive learning injection -TARGET_TOOLS = {"Bash", "Edit"} - # Max characters in the injected context to stay lightweight MAX_CONTEXT_CHARS = 500 @@ -160,11 +157,9 @@ def main(): event = json.loads(event_data) - # Early exit for non-target tools + # tool_name filter removed — matcher "Bash|Edit" in settings.json prevents + # this hook from spawning for non-matching tools. 
tool_name = event.get("tool_name", "") - if tool_name not in TARGET_TOOLS: - empty_output(EVENT_NAME).print_and_exit() - tool_input = event.get("tool_input", {}) # Extract tags based on tool type diff --git a/hooks/pretool-plan-gate.py b/hooks/pretool-plan-gate.py index 04c7398..2b2fa0b 100644 --- a/hooks/pretool-plan-gate.py +++ b/hooks/pretool-plan-gate.py @@ -54,9 +54,8 @@ def main() -> None: except (json.JSONDecodeError, ValueError): sys.exit(0) - tool_name = event.get("tool_name", "") - if tool_name not in ("Write", "Edit"): - sys.exit(0) + # tool_name filter removed — matcher "Write|Edit" in settings.json prevents + # this hook from spawning for non-matching tools. # Bypass env var — set by the plans skill itself. if os.environ.get(_BYPASS_ENV) == "1": diff --git a/hooks/pretool-prompt-injection-scanner.py b/hooks/pretool-prompt-injection-scanner.py index 88348d6..d3502ae 100644 --- a/hooks/pretool-prompt-injection-scanner.py +++ b/hooks/pretool-prompt-injection-scanner.py @@ -268,11 +268,9 @@ def main() -> None: print(f"[injection-scanner] JSON parse failed: {e}", file=sys.stderr) empty_output(EVENT_NAME).print_and_exit() - # Field name compatibility: try new names first, fall back to old + # tool_name filter removed — matcher "Write|Edit" in settings.json prevents + # this hook from spawning for non-matching tools. 
tool = event.get("tool_name") or event.get("tool", "") - if tool not in ("Write", "Edit"): - empty_output(EVENT_NAME).print_and_exit() - tool_input = event.get("tool_input", event.get("input", {})) file_path = tool_input.get("file_path", "") if not file_path: diff --git a/hooks/pretool-subagent-warmstart.py b/hooks/pretool-subagent-warmstart.py index 2a1a871..1da4886 100755 --- a/hooks/pretool-subagent-warmstart.py +++ b/hooks/pretool-subagent-warmstart.py @@ -251,10 +251,8 @@ def main() -> None: event = json.loads(event_data) - # Only process Agent tool invocations - tool_name = event.get("tool_name", "") - if tool_name != "Agent": - return + # tool_name filter removed — matcher "Agent" in settings.json prevents + # this hook from spawning for non-Agent tools. # Gather context from various sources files = load_recent_reads(Path(SESSION_READS_FILE)) diff --git a/hooks/pretool-synthesis-gate.py b/hooks/pretool-synthesis-gate.py index 086932b..f092066 100755 --- a/hooks/pretool-synthesis-gate.py +++ b/hooks/pretool-synthesis-gate.py @@ -123,9 +123,8 @@ def main() -> None: except (json.JSONDecodeError, ValueError): sys.exit(0) - tool_name = event.get("tool_name", "") - if tool_name not in ("Write", "Edit"): - sys.exit(0) + # tool_name filter removed — matcher "Write|Edit" in settings.json prevents + # this hook from spawning for non-matching tools. # Bypass env var — set by the consultation skill itself. 
if os.environ.get(_BYPASS_ENV) == "1": diff --git a/hooks/pretool-unified-gate.py b/hooks/pretool-unified-gate.py index 81d6751..79b0cfe 100644 --- a/hooks/pretool-unified-gate.py +++ b/hooks/pretool-unified-gate.py @@ -295,9 +295,9 @@ def check_creation_gate(file_path: str) -> None: component_type = "agent" if is_agent else "skill" _block( - f"[creation-gate] BLOCKED: New {component_type} must be created via skill-creator-engineer or skill-creation-pipeline.\n" + f"[creation-gate] BLOCKED: New {component_type} must be created via skill-creator or skill-creation-pipeline.\n" f"[creation-gate] Path: {file_path}\n" - f"[fix-with-agent] skill-creator-engineer" + f"[fix-with-agent] skill-creator" ) diff --git a/hooks/record-activation.py b/hooks/record-activation.py index 9ac1cb3..e52fdfb 100644 --- a/hooks/record-activation.py +++ b/hooks/record-activation.py @@ -28,18 +28,14 @@ from hook_utils import get_session_id from stdin_timeout import read_stdin -# Tools that represent meaningful work completing successfully -TRACKED_TOOLS = {"Edit", "Write", "Bash"} - def main() -> None: """Record session activation stats on successful tool completions.""" try: hook_input = json.loads(read_stdin(timeout=2)) - tool_name = hook_input.get("tool_name", "") - if tool_name not in TRACKED_TOOLS: - return + # tool_name filter removed — matcher "Edit|Write|Bash" in settings.json + # prevents this hook from spawning for non-matching tools. 
tool_result = hook_input.get("tool_result", {}) if tool_result.get("is_error", False): diff --git a/hooks/retro-graduation-gate.py b/hooks/retro-graduation-gate.py index 76bcc3c..f7900b7 100644 --- a/hooks/retro-graduation-gate.py +++ b/hooks/retro-graduation-gate.py @@ -30,16 +30,8 @@ def main() -> None: empty_output(EVENT).print_and_exit(0) return - # Event type guard (defensive — matches peer hook pattern) - event_type = data.get("hook_event_name") or data.get("type", "") - if event_type and event_type != EVENT: - empty_output(EVENT).print_and_exit(0) - return - - # Early-exit: only care about Bash tool (PostToolUse schema: tool_name) - if data.get("tool_name") != "Bash": - empty_output(EVENT).print_and_exit(0) - return + # tool_name/event_type filters removed — matcher "Bash" in settings.json + # prevents this hook from spawning for non-Bash tools. # Early-exit: check if output indicates a PR was created (PostToolUse schema: tool_result.output) tool_result = data.get("tool_result", {}) diff --git a/hooks/review-capture.py b/hooks/review-capture.py index 724f1db..9883a06 100644 --- a/hooks/review-capture.py +++ b/hooks/review-capture.py @@ -117,10 +117,8 @@ def main() -> None: event = json.loads(event_data) - # Only process Agent tool results - tool_name = event.get("tool_name", "") - if tool_name != "Agent": - return + # tool_name filter removed — matcher "Agent" in settings.json prevents + # this hook from spawning for non-Agent tools. 
# Get tool result text tool_result = event.get("tool_result", "") diff --git a/hooks/skill-evaluator.py b/hooks/skill-evaluator.py index d142510..39402d5 100644 --- a/hooks/skill-evaluator.py +++ b/hooks/skill-evaluator.py @@ -43,7 +43,7 @@ "testing-automation-engineer": "Unit/E2E tests, Playwright, CI pipelines", # Meta/Creation "agent-creator-engineer": "Create new specialized agents", - "skill-creator-engineer": "Create new Claude skills", + "skill-creator": "Create new Claude skills", "hook-development-engineer": "Create Claude Code hooks, event handlers", "mcp-local-docs-engineer": "Build MCP servers for documentation", # Coordination @@ -151,7 +151,7 @@ def get_evaluation_prompt(complexity: str) -> str: - Docs: technical-documentation-engineer, technical-journalist-writer - UI: ui-design-engineer, performance-optimization-engineer - Testing: testing-automation-engineer -- Meta: agent-creator-engineer, skill-creator-engineer, hook-development-engineer +- Meta: agent-creator-engineer, skill-creator, hook-development-engineer - Research: research-coordinator-engineer, project-coordinator-engineer - Critique: roast skill (5 personas: contrarian, newcomer, builder, senior, pedant)""" diff --git a/hooks/tests/test_post_tool_lint.py b/hooks/tests/test_post_tool_lint.py index 70ae6d3..88102b1 100755 --- a/hooks/tests/test_post_tool_lint.py +++ b/hooks/tests/test_post_tool_lint.py @@ -94,7 +94,11 @@ def test_ignores_non_lintable_files(): def test_ignores_read_tool(): - """Hook should only trigger for Write/Edit, not Read.""" + """Read tool filtering is now handled by matcher 'Write|Edit' in settings.json. + + When called directly (without matcher), the hook processes any tool_name. + This test verifies the hook still exits 0 (non-blocking) for any input. 
+ """ setup() event = { "type": "PostToolUse", @@ -104,7 +108,7 @@ def test_ignores_read_tool(): stdout, stderr, code = run_hook(event) assert code == 0 - assert stdout == "" + # Note: hook may produce output since tool_name filter was moved to matcher def test_handles_missing_file_path(): diff --git a/hooks/tests/test_posttool_session_reads.py b/hooks/tests/test_posttool_session_reads.py index 8e0fc05..6105971 100644 --- a/hooks/tests/test_posttool_session_reads.py +++ b/hooks/tests/test_posttool_session_reads.py @@ -51,30 +51,23 @@ def run_hook(event: dict) -> tuple[str, str, int]: class TestToolNameFiltering: """Only Read tool events should be processed.""" - def test_ignores_write_tool(self, tmp_path, monkeypatch): - """Write tool events should produce no output and no file.""" - monkeypatch.chdir(tmp_path) - event = { - "tool_name": "Write", - "tool_input": {"file_path": "/some/file.py"}, - } - stdout, stderr, code = run_hook(event) - assert code == 0 - # No session-reads.txt should be created - assert not (tmp_path / ".claude" / "session-reads.txt").exists() + def test_nonread_tool_exits_zero(self, tmp_path, monkeypatch): + """Non-Read tool filtering is now handled by matcher 'Read' in settings.json. - def test_ignores_edit_tool(self, tmp_path, monkeypatch): - """Edit tool events should be ignored.""" + When called directly (without matcher), the hook processes any tool_name. + This test verifies the hook still exits 0 (non-blocking) for any input. 
+ """ monkeypatch.chdir(tmp_path) - event = { - "tool_name": "Edit", - "tool_input": {"file_path": "/some/file.py"}, - } - stdout, stderr, code = run_hook(event) - assert code == 0 + for tool in ("Write", "Edit", "Bash"): + event = { + "tool_name": tool, + "tool_input": {"file_path": "/some/file.py"} if tool != "Bash" else {"command": "ls"}, + } + stdout, stderr, code = run_hook(event) + assert code == 0 def test_ignores_bash_tool(self, tmp_path, monkeypatch): - """Bash tool events should be ignored.""" + """Bash tool events should be ignored (no file_path to extract).""" monkeypatch.chdir(tmp_path) event = { "tool_name": "Bash", diff --git a/hooks/tests/test_pretool_subagent_warmstart.py b/hooks/tests/test_pretool_subagent_warmstart.py index f8a1b51..62da3c1 100644 --- a/hooks/tests/test_pretool_subagent_warmstart.py +++ b/hooks/tests/test_pretool_subagent_warmstart.py @@ -58,28 +58,19 @@ def run_hook(event: dict) -> tuple[str, str, int]: class TestToolNameFiltering: """Only Agent tool events should be processed.""" - def test_ignores_read_tool(self): - """Read tool events should produce no context output.""" - event = {"tool_name": "Read", "tool_input": {"file_path": "/x"}} - stdout, stderr, code = run_hook(event) - assert code == 0 - # Should be empty or empty hook output (no warmstart context) - if stdout.strip(): - output = json.loads(stdout) - hook_out = output.get("hookSpecificOutput", {}) - assert "additionalContext" not in hook_out or "[warmstart]" not in hook_out.get("additionalContext", "") - - def test_ignores_write_tool(self): - """Write tool events should be ignored.""" - event = {"tool_name": "Write", "tool_input": {"file_path": "/x"}} - stdout, stderr, code = run_hook(event) - assert code == 0 - - def test_ignores_bash_tool(self): - """Bash tool events should be ignored.""" - event = {"tool_name": "Bash", "tool_input": {"command": "ls"}} - stdout, stderr, code = run_hook(event) - assert code == 0 + def test_nonagent_tools_exit_zero(self): + 
"""Non-Agent tool filtering is now handled by matcher 'Agent' in settings.json. + + When called directly (without matcher), the hook processes any tool_name. + This test verifies the hook still exits 0 (non-blocking) for any input. + """ + for tool, tool_input in [ + ("Read", {"file_path": "/x"}), + ("Write", {"file_path": "/x"}), + ("Bash", {"command": "ls"}), + ]: + stdout, stderr, code = run_hook({"tool_name": tool, "tool_input": tool_input}) + assert code == 0 def test_processes_agent_tool(self, tmp_path, monkeypatch): """Agent tool events should produce warmstart context.""" diff --git a/hooks/usage-tracker.py b/hooks/usage-tracker.py index 6ea3847..73626de 100644 --- a/hooks/usage-tracker.py +++ b/hooks/usage-tracker.py @@ -32,17 +32,10 @@ def main(): event = json.loads(event_data) - # Only process PostToolUse events - event_type = event.get("hook_event_name") or event.get("type", "") - if event_type != "PostToolUse": - return - + # tool_name/event_type filters removed — matcher "Skill|Agent" in settings.json + # prevents this hook from spawning for non-matching tools. 
tool_name = event.get("tool_name", "") - # Only track Skill and Agent tools — exit silently for everything else - if tool_name not in ("Skill", "Agent"): - return - # Lazy import — only loaded when we actually need to record from hook_utils import get_project_dir, get_session_id from usage_db import record_agent, record_skill diff --git a/pipelines/INDEX.json b/pipelines/INDEX.json index 464d163..13a4e88 100644 --- a/pipelines/INDEX.json +++ b/pipelines/INDEX.json @@ -27,7 +27,7 @@ "agent-evaluation", "system-upgrade" ], - "agent": "skill-creator-engineer" + "agent": "skill-creator" }, "article-evaluation-pipeline": { "file": "pipelines/article-evaluation-pipeline/SKILL.md", @@ -626,7 +626,7 @@ "agent-evaluation", "routing-table-updater" ], - "agent": "skill-creator-engineer" + "agent": "skill-creator" }, "system-upgrade": { "file": "pipelines/system-upgrade/SKILL.md", diff --git a/pipelines/agent-upgrade/SKILL.md b/pipelines/agent-upgrade/SKILL.md index 5230821..69032bd 100644 --- a/pipelines/agent-upgrade/SKILL.md +++ b/pipelines/agent-upgrade/SKILL.md @@ -10,7 +10,7 @@ description: | version: 1.0.0 user-invocable: false argument-hint: "" -agent: skill-creator-engineer +agent: skill-creator allowed-tools: - Read - Bash diff --git a/pipelines/pipeline-scaffolder/references/architecture-rules.md b/pipelines/pipeline-scaffolder/references/architecture-rules.md index 29afef7..fd34cd1 100644 --- a/pipelines/pipeline-scaffolder/references/architecture-rules.md +++ b/pipelines/pipeline-scaffolder/references/architecture-rules.md @@ -84,7 +84,7 @@ Phase 1: DISCOVER (sequential — needs full context) ↓ Phase 2: SCAFFOLD (fan-out — group by creator type) ├─ agent-creator-engineer: Agent A, Agent B, Agent C (1..N) - ├─ skill-creator-engineer: Skill X, Skill Y (1..M) + ├─ skill-creator: Skill X, Skill Y (1..M) ├─ hook-development-engineer: Hook 1, Hook 2 (1..K) └─ Direct: Script 1, Script 2 (1..J) ↓ (fan-in — wait for all) diff --git 
a/pipelines/skill-creation-pipeline/SKILL.md b/pipelines/skill-creation-pipeline/SKILL.md index 6a1e0f5..f3a37fc 100644 --- a/pipelines/skill-creation-pipeline/SKILL.md +++ b/pipelines/skill-creation-pipeline/SKILL.md @@ -8,7 +8,7 @@ description: | Use for "create skill pipeline", "new skill formal", "skill with gates". version: 1.0.0 user-invocable: false -agent: skill-creator-engineer +agent: skill-creator allowed-tools: - Read - Bash @@ -38,7 +38,7 @@ routing: ## Operator Context -This pipeline wraps `skill-creator-engineer` with explicit discovery, design +This pipeline wraps `skill-creator` with explicit discovery, design review, and validation gates. It is the **formal path** for creating new skills — as opposed to ad-hoc creation — and should be used whenever skill quality, uniqueness, or routing correctness is important. The pipeline does not replace @@ -187,7 +187,7 @@ DESIGN BRIEF: [skill-name] ========================== Complexity Tier: [Simple | Medium | Complex | Comprehensive] -Agent Binding: skill-creator-engineer (default) or [other agent if domain-specific] +Agent Binding: skill-creator (default) or [other agent if domain-specific] User-Invocable: [true | false] Phases: @@ -323,7 +323,7 @@ Read the current INDEX.json and append an entry for the new skill: "path": "skills/skill-name/SKILL.md", "description": "[first line of the frontmatter description]", "user-invocable": true, - "agent": "skill-creator-engineer" + "agent": "skill-creator" } ``` diff --git a/pipelines/system-upgrade/SKILL.md b/pipelines/system-upgrade/SKILL.md index 55f7f9c..7bb5b02 100644 --- a/pipelines/system-upgrade/SKILL.md +++ b/pipelines/system-upgrade/SKILL.md @@ -46,7 +46,7 @@ complementing the **bottom-up** retro-knowledge-injector. ### Hardcoded Behaviors (Always Apply) - **Show Plan Before Implementing**: Phase 3 output (ranked upgrade list) MUST be presented to the user and approved before Phase 4 begins. Never silently execute upgrades. 
-- **Reuse Domain Agents**: Phase 4 (IMPLEMENT) dispatches to existing domain agents (skill-creator-engineer, agent-creator-engineer, hook-development-engineer, golang-general-engineer, etc.). The upgrade engineer orchestrates; specialists execute. +- **Reuse Domain Agents**: Phase 4 (IMPLEMENT) dispatches to existing domain agents (skill-creator, agent-creator-engineer, hook-development-engineer, golang-general-engineer, etc.). The upgrade engineer orchestrates; specialists execute. - **Parallel Fan-Out**: When 3+ components need the same type of upgrade, dispatch in parallel using multiple Agent tool calls in a single message. - **Score Delta Required**: Phase 5 (VALIDATE) must produce before/after evaluation delta, not just "looks good." Use `agent-evaluation` skill. - **Trigger Type Determines Input**: The three trigger types (claude-release, goal-change, retro-driven) require different input parsing in Phase 1. @@ -202,7 +202,7 @@ IMPORTANT (should fix): 4. skills/go-testing/SKILL.md — Apply new pattern from retro L2 [inject-pattern, ~10min] MINOR (nice to have): - 5. agents/skill-creator-engineer.md — Add new frontmatter field docs [upgrade, ~5min] + 5. agents/skill-creator.md — Add new frontmatter field docs [upgrade, ~5min] Total: 5 changes across 5 components Parallel dispatch: 3 groups (hooks, agents, skills) @@ -232,10 +232,10 @@ git checkout -b chore/system-upgrade-$(date +%Y-%m-%d) | Change Domain | Domain Agent | |--------------|-------------| | Hook modifications | hook-development-engineer | -| Agent upgrades | agent-creator-engineer (or skill-creator-engineer for agents) | -| Skill upgrades | skill-creator-engineer | +| Agent upgrades | agent-creator-engineer (or skill-creator for agents) | +| Skill upgrades | skill-creator | | Routing changes | routing-table-updater | -| Pattern injection | skill-creator-engineer or direct Edit | +| Pattern injection | skill-creator or direct Edit | **Step 2**: Dispatch parallel agents for independent groups. 
Use a single message with multiple Agent tool calls for changes that don't depend on each other. @@ -365,7 +365,7 @@ Solution: Manually copy modified files to `~/.claude/` equivalent directories. R ### Anti-Pattern 2: Handling All Changes Directly Instead of Dispatching **What it looks like**: Making all edits inline rather than routing to domain agents -**Why wrong**: Domain agents (skill-creator-engineer, hook-development-engineer) know the templates and anti-patterns for their domain +**Why wrong**: Domain agents (skill-creator, hook-development-engineer) know the templates and anti-patterns for their domain **Do instead**: Dispatch to domain agents for anything beyond simple pattern injection ### Anti-Pattern 3: Auditing Everything Every Time diff --git a/scripts/audit-tool-restrictions.py b/scripts/audit-tool-restrictions.py index 6f5e886..1e0301e 100644 --- a/scripts/audit-tool-restrictions.py +++ b/scripts/audit-tool-restrictions.py @@ -131,7 +131,7 @@ "python-openstack-engineer": "code-modifier", "rabbitmq-messaging-engineer": "code-modifier", "react-portfolio-engineer": "code-modifier", - "skill-creator-engineer": "code-modifier", + "skill-creator": "code-modifier", "sqlite-peewee-engineer": "code-modifier", "testing-automation-engineer": "code-modifier", "typescript-debugging-engineer": "code-modifier", diff --git a/scripts/routing-benchmark.json b/scripts/routing-benchmark.json index f41d1a9..7f80cdc 100644 --- a/scripts/routing-benchmark.json +++ b/scripts/routing-benchmark.json @@ -284,10 +284,9 @@ }, { "request": "create a new Claude Code skill with quality gates", - "expected_agent": "skill-creator-engineer", - "expected_skill": "skill-creation-pipeline", + "expected_skill": "skill-creator", "category": "meta-tooling", - "notes": "Skill creation — agent + pipeline pairing" + "notes": "Skill creation — skill-creator handles the full eval-driven workflow" }, { "request": "create a new hook for PostToolUse events", diff --git a/skills/INDEX.json 
b/skills/INDEX.json index 385355c..c9bc4de 100644 --- a/skills/INDEX.json +++ b/skills/INDEX.json @@ -1,6 +1,6 @@ { "version": "2.0", - "generated": "2026-03-25T23:05:47Z", + "generated": "2026-03-27T03:14:10Z", "generated_by": "scripts/generate-skill-index.py", "skills": { "adr-consultation": { @@ -16,7 +16,7 @@ "adr consultation" ], "category": "meta", - "user_invocable": true, + "user_invocable": false, "version": "1.0.0", "pairs_with": [ "feature-design", @@ -121,7 +121,7 @@ "find unused" ], "category": "code-quality", - "user_invocable": true, + "user_invocable": false, "version": "2.0.0" }, "code-linting": { @@ -206,26 +206,27 @@ }, "content-engine": { "file": "skills/content-engine/SKILL.md", - "description": "Repurpose a source asset into platform-native social content variants for X, LinkedIn, TikTok, YouTube, and newsletter. Produces content_ideas.md and content_drafts.md with a quality gate before delivery.", + "description": "Repurpose a source asset (article, demo, launch note, insight) into platform-native social content variants.", "triggers": [ "repurpose this", "adapt for social", "turn this into posts", "content from article", "content from demo", + "content from doc", "write variants for", "social content from", "platform variants", "repurpose for" ], "category": "content", - "user_invocable": true, + "user_invocable": false, "version": "1.0.0", "pairs_with": [ "x-api", "crosspost" ], - "disambiguate": "voice-writer" + "model": "sonnet" }, "create-voice": { "file": "skills/create-voice/SKILL.md", @@ -241,7 +242,7 @@ ], "category": "content", "force_route": true, - "user_invocable": true, + "user_invocable": false, "version": "1.0.0", "pairs_with": [ "voice-calibrator", @@ -358,7 +359,7 @@ "10 perspectives" ], "category": "meta-tooling", - "user_invocable": true, + "user_invocable": false, "version": "2.0.0" }, "docs-sync-checker": { @@ -385,16 +386,15 @@ "POM", "test flakiness" ], - "category": "testing", - "user_invocable": true, + 
"user_invocable": false, "version": "1.0.0", - "agent": "testing-automation-engineer", - "model": "sonnet", "pairs_with": [ "testing-automation-engineer", "typescript-frontend-engineer", "test-driven-development" - ] + ], + "agent": "testing-automation-engineer", + "model": "sonnet" }, "endpoint-validator": { "file": "skills/endpoint-validator/SKILL.md", @@ -425,7 +425,7 @@ ], "category": "process", "force_route": true, - "user_invocable": true, + "user_invocable": false, "version": "1.0.0", "pairs_with": [] }, @@ -442,7 +442,7 @@ ], "category": "process", "force_route": true, - "user_invocable": true, + "user_invocable": false, "version": "2.0.0", "pairs_with": [ "feature-plan", @@ -461,7 +461,7 @@ ], "category": "process", "force_route": true, - "user_invocable": true, + "user_invocable": false, "version": "2.0.0", "pairs_with": [ "feature-plan", @@ -481,7 +481,7 @@ ], "category": "process", "force_route": true, - "user_invocable": true, + "user_invocable": false, "version": "2.0.0", "pairs_with": [ "feature-design", @@ -501,7 +501,7 @@ ], "category": "process", "force_route": true, - "user_invocable": true, + "user_invocable": false, "version": "2.0.0", "pairs_with": [ "feature-validate", @@ -521,7 +521,7 @@ ], "category": "process", "force_route": true, - "user_invocable": true, + "user_invocable": false, "version": "2.0.0", "pairs_with": [ "feature-implement", @@ -553,32 +553,6 @@ "version": "2.0.0", "pairs_with": [] }, - "frontend-slides": { - "file": "skills/frontend-slides/SKILL.md", - "description": "Browser-based HTML presentation generation with viewport-fit enforcement, curated style presets, and deterministic overflow validation. 
Three paths: new build, PPTX-to-HTML conversion, or HTML deck enhancement.", - "triggers": [ - "HTML slides", - "browser presentation", - "web deck", - "reveal-style", - "viewport presentation", - "convert PPTX to web", - "convert PPTX to HTML", - "slides for a browser", - "kiosk presentation", - "interactive presentation keyboard", - "projector browser" - ], - "category": "frontend", - "user_invocable": true, - "version": "1.0.0", - "agent": "typescript-frontend-engineer", - "model": "sonnet", - "pairs_with": [ - "typescript-frontend-engineer", - "pptx-generator" - ] - }, "forensics": { "file": "skills/forensics/SKILL.md", "description": "Post-mortem diagnostic analysis of failed or stuck workflows.", @@ -597,7 +571,7 @@ "incident review" ], "category": "process", - "user_invocable": true, + "user_invocable": false, "version": "1.0.0", "pairs_with": [ "systematic-debugging", @@ -605,6 +579,32 @@ "plan-checker" ] }, + "frontend-slides": { + "file": "skills/frontend-slides/SKILL.md", + "description": "Browser-based HTML presentation generation with viewport-fit enforcement.", + "triggers": [ + "HTML slides", + "browser presentation", + "web deck", + "reveal-style", + "viewport presentation", + "convert PPTX to web", + "convert PPTX to HTML", + "slides for a browser", + "kiosk presentation", + "interactive presentation keyboard", + "projector browser" + ], + "category": "frontend", + "user_invocable": false, + "version": "1.0.0", + "pairs_with": [ + "typescript-frontend-engineer", + "pptx-generator" + ], + "agent": "typescript-frontend-engineer", + "model": "sonnet" + }, "full-repo-review": { "file": "skills/full-repo-review/SKILL.md", "description": "Run comprehensive 3-wave review against all source files in the repo, producing a prioritized issue backlog.", @@ -654,7 +654,7 @@ "make claude.md" ], "category": "documentation", - "user_invocable": true, + "user_invocable": false, "version": "1.0.0", "pairs_with": [ "go-sapcc-conventions", @@ -667,11 +667,7 @@ 
"triggers": [ "commit", "stage and commit", - "commit changes", - "save my work", - "commit this", - "save progress", - "checkpoint" + "commit changes" ], "category": "git-workflow", "force_route": true, @@ -703,7 +699,7 @@ "github inbox" ], "category": "github", - "user_invocable": true, + "user_invocable": false, "version": "1.0.0", "pairs_with": [], "model": "sonnet" @@ -875,7 +871,7 @@ "headless agent" ], "category": "process", - "user_invocable": true, + "user_invocable": false, "version": "1.0.0", "agent": "python-general-engineer" }, @@ -940,7 +936,7 @@ "wiring check" ], "category": "process", - "user_invocable": true, + "user_invocable": false, "version": "1.0.0", "pairs_with": [ "feature-implement", @@ -961,7 +957,7 @@ "reframe positively" ], "category": "content", - "user_invocable": true, + "user_invocable": false, "version": "1.0.0", "pairs_with": [ "voice-writer", @@ -969,6 +965,54 @@ "voice-validator" ] }, + "kotlin-coroutines": { + "file": "skills/kotlin-coroutines/SKILL.md", + "description": "Kotlin structured concurrency, Flow, Channel, and cancellation patterns", + "triggers": [ + "kotlin-coroutines", + "kotlin", + "coroutines" + ], + "user_invocable": false, + "version": "1.0.0", + "agent": "general-purpose" + }, + "kotlin-testing": { + "file": "skills/kotlin-testing/SKILL.md", + "description": "Kotlin testing patterns with JUnit 5, Kotest, and coroutine test dispatchers", + "triggers": [ + "kotlin-testing", + "kotlin", + "testing" + ], + "user_invocable": false, + "version": "1.0.0", + "agent": "general-purpose" + }, + "kubernetes-debugging": { + "file": "skills/kubernetes-debugging/SKILL.md", + "description": "Kubernetes debugging methodology for pod failures, networking issues, and resource problems", + "triggers": [ + "kubernetes-debugging", + "kubernetes", + "debugging" + ], + "user_invocable": false, + "version": "1.0.0", + "agent": "kubernetes-helm-engineer" + }, + "kubernetes-security": { + "file": "skills/kubernetes-security/SKILL.md", + 
"description": "Kubernetes security patterns including RBAC, PodSecurityStandards, network policies, and secret management", + "triggers": [ + "kubernetes-security", + "kubernetes", + "security" + ], + "user_invocable": false, + "version": "1.0.0", + "agent": "kubernetes-helm-engineer" + }, "learn": { "file": "skills/learn/SKILL.md", "description": "Manually teach Claude Code an error pattern and its solution, storing it in the learning database with high confidence.", @@ -978,7 +1022,7 @@ "manual learning entry" ], "category": "meta-tooling", - "user_invocable": true, + "user_invocable": false, "version": "2.0.0" }, "link-auditor": { @@ -1058,7 +1102,7 @@ "wrap up session" ], "category": "process", - "user_invocable": true, + "user_invocable": false, "version": "1.0.0", "pairs_with": [ "resume-work" @@ -1167,7 +1211,7 @@ "first-time Perses setup" ], "category": "perses", - "user_invocable": true, + "user_invocable": false, "version": "2.0.0", "agent": "perses-dashboard-engineer" }, @@ -1232,6 +1276,30 @@ "version": "2.0.0", "agent": "perses-dashboard-engineer" }, + "php-quality": { + "file": "skills/php-quality/SKILL.md", + "description": "PHP code quality patterns including PSR standards, strict types, and framework idioms", + "triggers": [ + "php-quality", + "php", + "quality" + ], + "user_invocable": false, + "version": "1.0.0", + "agent": "general-purpose" + }, + "php-testing": { + "file": "skills/php-testing/SKILL.md", + "description": "PHP testing patterns with PHPUnit, test doubles, and database testing", + "triggers": [ + "php-testing", + "php", + "testing" + ], + "user_invocable": false, + "version": "1.0.0", + "agent": "general-purpose" + }, "plan-checker": { "file": "skills/plan-checker/SKILL.md", "description": "Validate plans against 10 verification dimensions before execution begins.", @@ -1245,7 +1313,7 @@ "pre-execution check" ], "category": "process", - "user_invocable": true, + "user_invocable": false, "version": "1.0.0", "pairs_with": [ 
"feature-plan", @@ -1301,7 +1369,7 @@ "plant-seed" ], "category": "process", - "user_invocable": true, + "user_invocable": false, "version": "1.0.0", "pairs_with": [ "feature-design" @@ -1352,7 +1420,7 @@ "prune branches" ], "category": "git-workflow", - "user_invocable": true, + "user_invocable": false, "version": "2.0.0" }, "pr-fix": { @@ -1364,7 +1432,7 @@ "pr-fix" ], "category": "git-workflow", - "user_invocable": true, + "user_invocable": false, "version": "2.0.0" }, "pr-miner": { @@ -1397,7 +1465,7 @@ "address review comments" ], "category": "git-workflow", - "user_invocable": true, + "user_invocable": false, "version": "2.0.0" }, "pr-status": { @@ -1410,7 +1478,7 @@ ], "category": "git-workflow", "force_route": true, - "user_invocable": true, + "user_invocable": false, "version": "2.0.0" }, "pr-sync": { @@ -1420,11 +1488,7 @@ "push", "push changes", "create PR", - "sync to GitHub", - "open a pull request", - "make a PR", - "submit PR", - "push and PR" + "sync to GitHub" ], "category": "git-workflow", "force_route": true, @@ -1445,7 +1509,7 @@ ], "category": "process", "force_route": true, - "user_invocable": true, + "user_invocable": false, "version": "1.0.0", "pairs_with": [ "feature-design", @@ -1542,7 +1606,7 @@ "Reddit reports" ], "category": "process", - "user_invocable": true, + "user_invocable": false, "version": "1.0.0", "agent": "python-general-engineer" }, @@ -1558,7 +1622,7 @@ "read every file in repo" ], "category": "analysis", - "user_invocable": true, + "user_invocable": false, "version": "1.0.0", "pairs_with": [ "explore-pipeline" @@ -1609,7 +1673,7 @@ "poke holes in this" ], "category": "analysis", - "user_invocable": true, + "user_invocable": false, "version": "2.0.0" }, "routing-table-updater": { @@ -1635,7 +1699,7 @@ "sapcc standards check" ], "category": "language", - "user_invocable": true, + "user_invocable": false, "version": "2.0.0", "pairs_with": [ "golang-general-engineer", @@ -1656,7 +1720,7 @@ "review sapcc standards" ], 
"category": "language", - "user_invocable": true, + "user_invocable": false, "version": "1.0.0", "pairs_with": [ "golang-general-engineer", @@ -1666,6 +1730,29 @@ "agent": "golang-general-engineer", "model": "opus" }, + "security-threat-model": { + "file": "skills/security-threat-model/SKILL.md", + "description": "Phase-gated security threat model skill.", + "triggers": [ + "threat model", + "security audit", + "supply chain scan", + "deny list", + "learning db sanitize", + "security posture", + "injection scan", + "surface scan", + "audit hooks", + "audit skills" + ], + "category": "security", + "user_invocable": false, + "version": "1.0.0", + "pairs_with": [ + "python-general-engineer" + ], + "model": "opus" + }, "seo-optimizer": { "file": "skills/seo-optimizer/SKILL.md", "description": "Analyze and optimize blog post SEO: keywords, titles, meta descriptions, headers, and internal linking.", @@ -1716,6 +1803,27 @@ "user_invocable": false, "version": "2.0.0" }, + "skill-creator": { + "file": "skills/skill-creator/SKILL.md", + "description": "Create new skills and iteratively improve them through eval-driven validation.", + "triggers": [ + "create skill", + "new skill", + "skill template", + "skill design", + "test skill", + "improve skill", + "optimize description", + "skill eval" + ], + "category": "meta", + "user_invocable": false, + "version": "2.0.0", + "pairs_with": [ + "agent-evaluation", + "verification-before-completion" + ] + }, "skill-eval": { "file": "skills/skill-eval/SKILL.md", "description": "Evaluate and improve skills through measured testing.", @@ -1729,13 +1837,13 @@ "skill quality" ], "category": "meta", - "user_invocable": true, + "user_invocable": false, "version": "1.0.0", "pairs_with": [ "agent-evaluation", "verification-before-completion" ], - "agent": "skill-creator-engineer" + "agent": "skill-creator" }, "socratic-debugging": { "file": "skills/socratic-debugging/SKILL.md", @@ -1788,6 +1896,30 @@ "user_invocable": false, "version": "2.0.0" 
}, + "swift-concurrency": { + "file": "skills/swift-concurrency/SKILL.md", + "description": "Swift structured concurrency with async/await, Actor, Task, and Sendable patterns", + "triggers": [ + "swift-concurrency", + "swift", + "concurrency" + ], + "user_invocable": false, + "version": "1.0.0", + "agent": "general-purpose" + }, + "swift-testing": { + "file": "skills/swift-testing/SKILL.md", + "description": "Swift testing patterns with XCTest, Swift Testing framework, and async test patterns", + "triggers": [ + "swift-testing", + "swift", + "testing" + ], + "user_invocable": false, + "version": "1.0.0", + "agent": "general-purpose" + }, "systematic-code-review": { "file": "skills/systematic-code-review/SKILL.md", "description": "4-phase code review methodology: UNDERSTAND changes, VERIFY claims against code, ASSESS security/performance/architecture risks, DOCUMENT findings with severity classification.", @@ -1984,12 +2116,12 @@ "assemble clips", "video editing" ], - "category": "media", - "user_invocable": true, + "user_invocable": false, "version": "1.0.0", "pairs_with": [ "typescript-frontend-engineer" ], + "agent": "python-general-engineer", "model": "sonnet" }, "vitest-runner": { @@ -2037,7 +2169,7 @@ "strict verification" ], "category": "process", - "user_invocable": true, + "user_invocable": false, "version": "2.0.0" }, "wordpress-live-validation": { @@ -2099,7 +2231,7 @@ }, "x-api": { "file": "skills/x-api/SKILL.md", - "description": "Post tweets, build threads, upload media, and read timelines via the X API with OAuth 1.0a/2.0 and a mandatory confirm gate before any write.", + "description": "Post tweets, build threads, upload media, and read timelines via the X API.", "triggers": [ "post to X", "post tweet", @@ -2116,12 +2248,14 @@ "publish to twitter" ], "category": "content-publishing", - "user_invocable": true, + "user_invocable": false, "version": "1.0.0", "pairs_with": [ "content-engine", "crosspost" - ] + ], + "agent": "python-general-engineer", + 
"model": "sonnet" } } } diff --git a/skills/agent-evaluation/SKILL.md b/skills/agent-evaluation/SKILL.md index 761fde1..2534f9e 100644 --- a/skills/agent-evaluation/SKILL.md +++ b/skills/agent-evaluation/SKILL.md @@ -60,7 +60,7 @@ This skill operates as an operator for agent/skill quality assurance, configurin - Batch-evaluate entire collections with summary statistics ## What This Skill CANNOT Do -- Modify or fix agents/skills (use skill-creator-engineer instead) +- Modify or fix agents/skills (use skill-creator instead) - Evaluate external repositories or non-agent/skill files - Replace human judgment on content accuracy or domain correctness - Skip rubric categories — all must be scored diff --git a/skills/do/references/routing-tables.md b/skills/do/references/routing-tables.md index 83866ae..9063415 100644 --- a/skills/do/references/routing-tables.md +++ b/skills/do/references/routing-tables.md @@ -33,7 +33,7 @@ Route to these agents based on the user's task domain. Each entry describes what | **project-coordinator-engineer** | User needs multi-agent coordination for a large project: spawning parallel agents, tracking cross-cutting tasks, or orchestrating a multi-phase effort. | | **pipeline-orchestrator-engineer** | User wants to create a new pipeline, scaffold a new structured workflow, or compose pipeline phases. | | **hook-development-engineer** | User wants to create or modify Python hooks for Claude Code's event-driven system (SessionStart, PostToolUse, etc.). | -| **skill-creator-engineer** | User wants to create or improve a Claude Code skill, workflow automation, or agent configuration. | +| **skill-creator** | User wants to create or improve a Claude Code skill, workflow automation, or agent configuration. | | **system-upgrade-engineer** | User wants to upgrade the agent/skill/hook ecosystem after a Claude model update or system-wide change. 
| | **technical-documentation-engineer** | User needs technical documentation created, maintained, or validated — API docs, READMEs, architecture guides. | | **technical-journalist-writer** | User needs professional technical writing in a journalism style — articles, posts, or content with a specific authored voice. | @@ -46,7 +46,7 @@ Route to these agents based on the user's task domain. Each entry describes what | **github-profile-rules-engineer** | User wants to extract coding conventions, programming rules, or style guidelines from a GitHub profile's repositories. | | **react-portfolio-engineer** | User is building a React portfolio or gallery website, typically for creative professionals. | | **nextjs-ecommerce-engineer** | User is building an e-commerce site with Next.js: product pages, cart, checkout flows. | -| **toolkit-governance-engineer** | User wants to maintain or modify the toolkit's own internal structure: editing skill/agent files, updating routing tables, managing ADRs, regenerating INDEX.json, or enforcing frontmatter compliance. NOT: creating brand-new agents (use skill-creator-engineer), writing application code (domain agents), or reviewing external PRs (reviewer agents). | +| **toolkit-governance-engineer** | User wants to maintain or modify the toolkit's own internal structure: editing skill/agent files, updating routing tables, managing ADRs, regenerating INDEX.json, or enforcing frontmatter compliance. NOT: creating brand-new agents (use skill-creator), writing application code (domain agents), or reviewing external PRs (reviewer agents). | --- @@ -229,10 +229,10 @@ All pipelines live in the `pipelines/` directory (synced to `~/.claude/skills/` |----------|--------------------|--------| | **pipeline-scaffolder** (pipeline-orchestrator-engineer) | User wants to create a new pipeline, scaffold a new structured workflow from a spec. 
| LOAD → SCAFFOLD → INTEGRATE → REPORT | | **system-upgrade** (system-upgrade-engineer) | User wants to upgrade the Claude Code toolkit after a model update, apply system-wide changes, or roll out agent improvements. NOT: upgrading a specific library dependency in user code. | CHANGELOG → AUDIT → PLAN → IMPLEMENT → VALIDATE → DEPLOY | -| **skill-creation-pipeline** (skill-creator-engineer) | User wants to create a new skill with formal quality gates, phase structure, and integration. | DISCOVER → DESIGN → SCAFFOLD → VALIDATE → INTEGRATE | +| **skill-creation-pipeline** (skill-creator) | User wants to create a new skill with formal quality gates, phase structure, and integration. | DISCOVER → DESIGN → SCAFFOLD → VALIDATE → INTEGRATE | | **hook-development-pipeline** (hook-development-engineer) | User wants to create a new hook with formal spec, performance testing, and registration. | SPEC → IMPLEMENT → TEST → REGISTER → DOCUMENT | | **research-pipeline** (research-coordinator-engineer) | User wants formal research with saved artifacts, multiple sources, and a synthesized deliverable. NOT: a quick lookup or single-source check. | SCOPE → GATHER → SYNTHESIZE → VALIDATE → DELIVER | -| **agent-upgrade** (skill-creator-engineer) | User wants to audit and improve a specific agent to bring it up to current template standards. | AUDIT → DIFF → PLAN → IMPLEMENT → RE-EVALUATE | +| **agent-upgrade** (skill-creator) | User wants to audit and improve a specific agent to bring it up to current template standards. | AUDIT → DIFF → PLAN → IMPLEMENT → RE-EVALUATE | | **research-to-article** | User wants to research a topic and turn the findings into a written article. | RESEARCH → COMPILE → GROUND → GENERATE → VALIDATE → REFINE → OUTPUT | | **doc-pipeline** | User wants to generate documentation for a codebase, create a README, or write technical docs from scratch. 
| RESEARCH → OUTLINE → GENERATE → VERIFY → OUTPUT | | **pr-pipeline** | User wants the full structured PR workflow with review gates. | CLASSIFY → STAGE → REVIEW → COMMIT → PUSH → CREATE → VERIFY → CLEANUP | @@ -376,10 +376,10 @@ Invoked via the roast skill or directly: | "research then write article" | research-to-article pipeline | Research-backed content creation | | "create a pipeline for X" | pipeline-orchestrator-engineer + pipeline-scaffolder | Pipeline creation | | "upgrade system for new Claude version" | system-upgrade-engineer + system-upgrade | System-wide upgrade | -| "create skill with quality gates" | skill-creator-engineer + skill-creation-pipeline | Formal skill creation | +| "create skill with quality gates" | skill-creator + skill-creation-pipeline | Formal skill creation | | "create hook (formal, with perf test)" | hook-development-engineer + hook-development-pipeline | Formal hook creation | | "research with saved artifacts" | research-coordinator-engineer + research-pipeline | Formal research pipeline | -| "upgrade this specific agent" | skill-creator-engineer + agent-upgrade | Single agent improvement | +| "upgrade this specific agent" | skill-creator + agent-upgrade | Single agent improvement | | "create a 3D scene" | typescript-frontend-engineer + threejs-builder | Frontend domain, 3D task | | "generate image with Python" | python-general-engineer + gemini-image-generator | Python domain, image generation | | "extract coding rules from github user X" | github-profile-rules-engineer + github-profile-rules | Profile analysis | diff --git a/skills/routing-table-updater/SKILL.md b/skills/routing-table-updater/SKILL.md index f55973e..0a51a76 100644 --- a/skills/routing-table-updater/SKILL.md +++ b/skills/routing-table-updater/SKILL.md @@ -3,7 +3,7 @@ name: routing-table-updater description: | Maintain /do routing tables and command references when skills or agents are added, modified, or removed. 
Use when skill/agent metadata changes, - after skill-creator-engineer or agent-creator-engineer runs, or when + after skill-creator or agent-creator-engineer runs, or when routing tables need synchronization. Use for "update routes", "sync routing", "routing table", or "refresh /do". Do NOT use for creating new skills/agents, modifying skill logic, or manual /do table edits. @@ -262,7 +262,7 @@ If gate fails: ### Example 1: New Skill Created -User creates `skills/api-integration-helper/SKILL.md` via skill-creator-engineer: +User creates `skills/api-integration-helper/SKILL.md` via skill-creator: ```yaml --- @@ -375,7 +375,7 @@ The scaffolder provides a component list (from the Pipeline Spec): | Scan | All skills/* and agents/* | Only listed components | | Conflict check | Against existing entries | Against existing AND within batch | | OUTPUT | One entry at a time | N entries in one pass | -| Invoked by | skill-creator-engineer, agent-creator-engineer | pipeline-scaffolder Phase 4 | +| Invoked by | skill-creator, agent-creator-engineer | pipeline-scaffolder Phase 4 | --- @@ -383,7 +383,7 @@ The scaffolder provides a component list (from the Pipeline Spec): This skill is typically invoked after other creation skills complete: -- **After skill-creator-engineer**: New skill created, routing tables need updated entry +- **After skill-creator**: New skill created, routing tables need updated entry - **After agent-creator-engineer**: New agent created, domain routing needs expansion - **After skill/agent modification**: Description or trigger changes require routing refresh - **During repository maintenance**: Periodic sync to catch manual drift diff --git a/skills/shared-patterns/pipeline-architecture.md b/skills/shared-patterns/pipeline-architecture.md index f182873..2f71830 100644 --- a/skills/shared-patterns/pipeline-architecture.md +++ b/skills/shared-patterns/pipeline-architecture.md @@ -215,7 +215,7 @@ Define Requirements Add to Routing ``` -**Skill**: 
`agent-creator-engineer` or `skill-creator-engineer` +**Skill**: `agent-creator-engineer` or `skill-creator` --- diff --git a/skills/skill-creator/SKILL.md b/skills/skill-creator/SKILL.md new file mode 100644 index 0000000..7bab347 --- /dev/null +++ b/skills/skill-creator/SKILL.md @@ -0,0 +1,390 @@ +--- +name: skill-creator +description: | + Create new skills and iteratively improve them through eval-driven validation. + Draft a skill, test it against real prompts, review the outputs, improve based + on measured results, repeat. Use when creating new skills, improving existing + skills, testing skill quality, or optimizing descriptions for triggering accuracy. + Use for "create skill", "new skill", "test skill", "improve skill", "optimize + description", "skill eval", "turn this into a skill". Do NOT use for agent + creation (use agent-creator-engineer) or hook development (use + hook-development-engineer). +version: 2.0.0 +routing: + triggers: + - create skill + - new skill + - skill template + - skill design + - test skill + - improve skill + - optimize description + - skill eval + pairs_with: + - agent-evaluation + - verification-before-completion + complexity: Complex + category: meta +allowed-tools: + - Read + - Edit + - Write + - Bash + - Glob + - Grep + - Agent +--- + +# Skill Creator + +Create skills and iteratively improve them through measurement. + +The process: + +- Decide what the skill should do and how it should work +- Write a draft of the skill +- Create test prompts and run claude-with-the-skill on them +- Evaluate the results — both with agent reviewers and optionally human review +- Improve the skill based on what the evaluation reveals +- Repeat until the skill demonstrably helps + +Figure out where the user is in this process and help them progress. If they say +"I want to make a skill for X", help narrow scope, write a draft, write test cases, +and run the eval loop. If they already have a draft, go straight to testing. 
+ +--- + +## Creating a skill + +### Capture intent + +Start by understanding what the user wants. The current conversation might already +contain a workflow worth capturing ("turn this into a skill"). If so, extract: + +1. What should this skill enable Claude to do? +2. When should this skill trigger? (what user phrases, what contexts) +3. What is the expected output? +4. Are the outputs objectively verifiable (code, data transforms, structured files) + or subjective (writing quality, design aesthetics)? Objectively verifiable outputs + benefit from test cases. Subjective outputs are better evaluated by human review. + +### Research + +Check for existing skills that overlap — run `grep -r "trigger-keyword" skills/*/SKILL.md` +to avoid duplicating what already exists. If a similar skill exists, offer to improve +it rather than create a new one. + +Read the repository CLAUDE.md before writing anything. Project conventions override +default patterns. + +### Write the SKILL.md + +Based on the user interview, create the skill directory and write the SKILL.md. + +**Skill structure:** + +``` +skill-name/ +├── SKILL.md # Required — the workflow +├── scripts/ # Deterministic CLI tools the skill invokes +├── agents/ # Subagent prompts used only by this skill +├── references/ # Deep context loaded on demand +└── assets/ # Templates, viewers, static files +``` + +**Frontmatter** — name, description, routing metadata: + +```yaml +--- +name: skill-slug-name +description: | + [What it does — 1-2 sentences]. Use when [trigger conditions]. + Use for "[phrase 1]", "[phrase 2]". Do NOT use for [exclusions]. +version: 1.0.0 +routing: + triggers: + - keyword1 + - keyword2 + pairs_with: + - related-skill + complexity: Simple | Medium | Complex + category: language | infrastructure | review | meta | content +allowed-tools: + - Read + - Write + - Bash +--- +``` + +The description is the primary triggering mechanism. 
Claude tends to undertrigger +skills — not activating them when they would help. Combat this by being explicit +about trigger contexts. Include "Use for" with concrete phrases users would say. + +**Body** — workflow first, then context: + +1. Brief overview (2-3 sentences: what this does and how) +2. Instructions / workflow phases (the actual methodology) +3. Reference material (commands, guides, schemas) +4. Error handling (cause/solution pairs for common failures) +5. References to bundled files + +Constraints belong inline within the workflow step where they apply, not in a +separate section. If a constraint matters during Phase 2, put it in Phase 2 — +not in a preamble the model reads 200 lines before it reaches Phase 2. + +Explain the reasoning behind constraints rather than issuing bare imperatives. +"Run with `-race` because race conditions are silent until production" is more +effective than "ALWAYS run with -race" because the model can generalize the +reasoning to situations the skill author didn't anticipate. + +**Progressive disclosure** — keep SKILL.md navigable: +- Summary in frontmatter, workflow in body, deep reference in `references/` +- If SKILL.md exceeds ~500 lines, move detailed catalogs to reference files +- Reference files clearly linked from SKILL.md with guidance on when to read them + +### Bundled scripts + +Extract deterministic, repeatable operations into `scripts/*.py` CLI tools with +argparse interfaces. Scripts save tokens (the model doesn't reinvent the wheel +each invocation), ensure consistency across runs, and can be tested independently. + +Pattern: `scripts/` for deterministic ops, SKILL.md for LLM-orchestrated workflow. + +### Bundled agents + +For skills that spawn subagents with specialized roles, bundle agent prompts in +`agents/`. These are not registered in the routing system — they are internal to +the skill's workflow. 
+ +| Scenario | Approach | +|----------|----------| +| Agent used only by this skill | Bundle in `agents/` | +| Agent shared across skills | Keep in repo `agents/` directory | +| Agent needs routing metadata | Keep in repo `agents/` directory | + +--- + +## Testing the skill + +This is the core of the eval loop. Do not stop after writing — test the skill +against real prompts and measure whether it actually helps. + +### Create test prompts + +Write 2-3 realistic test prompts — the kind of thing a real user would say. Rich, +detailed, specific. Not abstract one-liners. + +Bad: `"Format this data"` +Good: `"I have a CSV in ~/downloads/q4-sales.csv with revenue in column C and costs +in column D. Add a profit margin percentage column and highlight rows where margin +is below 10%."` + +Share prompts with the user for review before running them. + +Save test cases to `evals/evals.json` in the workspace (not in the skill directory — +eval data is ephemeral): + +```json +{ + "skill_name": "example-skill", + "evals": [ + { + "id": 1, + "name": "descriptive-name", + "prompt": "The realistic user prompt", + "assertions": [] + } + ] +} +``` + +### Run test prompts + +For each test case, spawn two subagents in the same turn — one with the skill +loaded, one without (baseline). Launch everything at once so it finishes together. + +**With-skill run:** Tell the subagent to read the skill's SKILL.md first, then +execute the task. Save outputs to the workspace. + +**Baseline run:** Same prompt, no skill loaded. Save to a separate directory. + +Organize results by iteration: + +``` +skill-workspace/ +├── evals/evals.json +├── iteration-1/ +│ ├── eval-descriptive-name/ +│ │ ├── with_skill/outputs/ +│ │ ├── without_skill/outputs/ +│ │ └── grading.json +│ └── benchmark.json +└── iteration-2/ + └── ... +``` + +### Evaluate results + +Evaluation has three tiers, applied in order: + +**Tier 1: Deterministic checks** — run automatically where applicable: +- Does the code compile? 
(`go build`, `tsc --noEmit`, `python -m py_compile`) +- Do tests pass? (`go test -race`, `pytest`, `vitest`) +- Does the linter pass? (`go vet`, `ruff`, `biome`) + +**Tier 2: Agent blind review** — dispatch using `agents/comparator.md`: +- Comparator receives both outputs labeled "Output 1" / "Output 2" +- It does NOT know which is the skill version +- Scores on relevant dimensions, picks a winner with reasoning +- Save results to `blind_comparison.json` + +**Tier 3: Human review (optional)** — generate the comparison viewer: +```bash +python3 scripts/eval_compare.py path/to/workspace +open path/to/workspace/compare_report.html +``` + +The viewer shows outputs side by side with blind labels, agent review panels, +deterministic check results, winner picker, feedback textarea, and a +skip-to-results option. Human reviews are optional — agent reviews are sufficient +for iteration. + +### Draft assertions + +While test runs are in progress, draft quantitative assertions for objective +criteria. Good assertions are discriminating — they fail when the skill doesn't +help and pass when it does. Non-discriminating assertions ("file exists") provide +false confidence. + +Run the grader (`agents/grader.md`) to evaluate assertions against outputs: +- PASS requires genuine substance, not surface compliance +- The grader also critiques the assertions themselves — flagging ones that would + pass regardless of skill quality + +Aggregate results with `scripts/aggregate_benchmark.py` to get pass rates, +timing, and token usage with mean/stddev across runs. + +--- + +## Improving the skill + +This is the iterative heart of the process. + +**Generalize from feedback.** Skills will be used across many prompts, not just +test cases. If a fix only helps the test case but wouldn't generalize, it's +overfitting. Try different approaches rather than fiddly adjustments. + +**Keep instructions lean.** Read the execution transcripts, not just the final +outputs. 
If the skill causes the model to waste time on unproductive work, remove +those instructions. Instructions that don't pull their weight hurt more than they +help — they consume attention budget without producing value. + +**Explain the reasoning.** Motivation-based instructions generalize better than +rigid imperatives. "Prefer table-driven tests because they make adding cases +trivial and the input-output relationship explicit" works better than "MUST use +table-driven tests" because the model understands when the pattern applies and +when it doesn't. + +**Extract repeated work.** Read the transcripts from test runs. If all subagents +independently wrote similar helper scripts or took the same multi-step approach, +bundle that script in `scripts/`. One shared implementation beats N independent +reinventions. + +### The iteration loop + +1. Apply improvements to the skill +2. Rerun all test cases into `iteration-/`, including baselines +3. Generate the comparison viewer with `--previous-workspace` pointing at the + prior iteration +4. Review — agent or human +5. Repeat until results plateau or the user is satisfied + +Stop iterating when: +- Feedback is empty (outputs look good) +- Pass rates aren't improving between iterations +- The user says they're satisfied + +--- + +## Description optimization + +The description field determines whether Claude activates the skill. After the +skill is working well, optimize the description for triggering accuracy. + +Generate 20 eval queries — 10 that should trigger, 10 that should not. The +should-not queries are the most important: they should be near-misses from +adjacent domains, not obviously irrelevant queries. 
+ +Run the optimization loop: +```bash +python3 scripts/optimize_description.py \ + --skill-path path/to/skill \ + --eval-set evals/trigger-eval.json \ + --max-iterations 5 +``` + +This splits queries 60/40 train/test, evaluates the current description (3 runs +per query for reliability), proposes improvements based on failures, and selects +the best description by test-set score to avoid overfitting. + +--- + +## Bundled agents + +The `agents/` directory contains prompts for specialized subagents used by this +skill. Read them when you need to spawn the relevant subagent. + +- `agents/grader.md` — Evaluate assertions against outputs with cited evidence +- `agents/comparator.md` — Blind A/B comparison of two outputs +- `agents/analyzer.md` — Post-hoc analysis of why one version beat another + +--- + +## Bundled scripts + +- `scripts/run_eval.py` — Execute a skill against a test prompt via `claude -p` +- `scripts/aggregate_benchmark.py` — Compute pass rate statistics across runs +- `scripts/optimize_description.py` — Train/test description optimization loop +- `scripts/package_results.py` — Consolidate iteration artifacts into a report +- `scripts/eval_compare.py` — Generate blind comparison HTML viewer + +--- + +## Reference files + +- `references/artifact-schemas.md` — JSON schemas for eval artifacts (evals.json, + grading.json, benchmark.json, comparison.json, timing.json, metrics.json) +- `references/skill-template.md` — Complete SKILL.md template with all sections +- `references/complexity-tiers.md` — Skill examples by complexity tier +- `references/workflow-patterns.md` — Reusable phase structures and gate patterns +- `references/error-catalog.md` — Common skill creation errors with solutions + +--- + +## Error handling + +### Skill doesn't trigger when it should +Cause: Description is too vague or missing trigger phrases +Solution: Add explicit "Use for" phrases matching what users actually say. +Test with `scripts/optimize_description.py`. 
+ +### Test run produces empty output +Cause: The `claude -p` subprocess didn't load the skill, or the skill path is wrong +Solution: Verify the skill directory contains SKILL.md (exact case). Check +the `--skill-path` argument points to the directory, not the file. + +### Grading results show all-pass regardless of skill +Cause: Assertions are non-discriminating (e.g., "file exists") +Solution: Write assertions that test behavior, not structure. The grader's +eval critique section flags these — read it. + +### Iteration loop doesn't converge +Cause: Changes are overfitting to test cases rather than improving the skill +Solution: Expand the test set with more diverse prompts. Focus improvements +on understanding WHY outputs differ, not on patching specific failures. + +### Description optimization overfits to train set +Cause: Test set is too small or train/test queries are too similar +Solution: Ensure should-trigger and should-not-trigger queries are realistic +near-misses, not obviously different. The 60/40 split guards against this, +but only if the queries are well-designed. diff --git a/skills/skill-creator/agents/analyzer.md b/skills/skill-creator/agents/analyzer.md new file mode 100644 index 0000000..e4665e2 --- /dev/null +++ b/skills/skill-creator/agents/analyzer.md @@ -0,0 +1,109 @@ +# Analyzer Agent + +You are a post-hoc analysis agent for eval pipelines. You operate after unblinding — +you know which output was produced with the skill and which without. Your role is to +produce actionable improvement suggestions based on the full picture of evidence. + +## Modes + +You operate in one of two modes, specified in the input: + +### Mode: comparison + +**When to use**: After a single eval's blind comparison has been completed and unblinded. 
+ +**Inputs**: +- `comparison_json`: Path to comparison.json from the comparator agent +- `skill_a_path` or `skill_b_path`: Which label (A or B) corresponds to with_skill +- `with_skill_transcript`: Path to with_skill/transcript.md +- `without_skill_transcript`: Path to without_skill/transcript.md +- `with_skill_outputs_dir`: Path to with_skill/outputs/ +- `without_skill_outputs_dir`: Path to without_skill/outputs/ + +**Analysis tasks**: +1. Identify WHY the winner won (specific criterion advantages) +2. Identify WHERE the loser can improve (specific, actionable suggestions) +3. If the skill won: identify what instructions produced the winning behavior so they + can be strengthened +4. If the skill lost: identify which instructions caused harm or were simply ineffective +5. Check if the skill caused unnecessary work in the transcript (unproductive loops, + redundant steps, ignored instructions) + +### Mode: benchmark + +**When to use**: After an iteration's full benchmark has been computed. + +**Inputs**: +- `benchmark_json`: Path to iteration's benchmark.json +- `all_grading_jsons`: List of paths to all grading.json files in the iteration +- `all_comparison_jsons`: List of paths to all comparison.json files in the iteration + +**Analysis tasks**: +1. Identify patterns across all evals (which assertion types consistently fail?) +2. Flag non-discriminating assertions that appeared in multiple evals +3. Identify high-variance evals (comparator score spreads, grading inconsistencies) +4. Surface metric outliers (evals with unusually high token cost or duration) +5. 
Produce 3-5 prioritized improvement suggestions for the skill + +## Output + +Produce a JSON file named `analysis.json` with exactly this structure: + +```json +{ + "mode": "comparison | benchmark", + "timestamp": "ISO 8601 timestamp", + "skill_won": "boolean — true if with_skill won (comparison mode) or pass_rate delta > 0 (benchmark mode)", + "findings": [ + { + "category": "winner_factors | loser_improvements | instruction_analysis | transcript_waste | assertion_quality | metric_outliers | variance", + "priority": "high | medium | low", + "finding": "specific observation with cited evidence", + "actionable_suggestion": "concrete change to make to the skill or eval" + } + ], + "improvements_for_skill": [ + { + "target": "which section/instruction to change", + "current_behavior": "what the skill currently does", + "desired_behavior": "what it should do instead", + "rationale": "why this change would improve results", + "generalization_risk": "low | medium | high — risk of overfitting this change to test cases" + } + ], + "improvements_for_evals": [ + { + "assertion": "the assertion to improve or replace", + "problem": "why this assertion is weak or non-discriminating", + "replacement": "suggested replacement assertion text" + } + ], + "benchmark_summary": { + "with_skill_pass_rate_mean": "float — benchmark mode only", + "without_skill_pass_rate_mean": "float — benchmark mode only", + "delta": "float — with_skill minus without_skill", + "comparator_win_rate": "float — fraction of evals where skill won", + "top_failure_categories": ["list of assertion categories that frequently fail"] + }, + "analyzer_notes": "optional string — observations that do not fit the structured fields" +} +``` + +The schema is a contract. Field names, types, and nesting must match exactly. The +`package_results.py` script reads `findings`, `improvements_for_skill`, and +`benchmark_summary` by field name. + +## Behavior Rules + +- Every finding must cite specific evidence. 
"The skill seems to help" is not a finding. + "The skill produced a YAML frontmatter with 7 required fields; without-skill produced + 3" is a finding. +- `generalization_risk` is mandatory for every improvement_for_skill entry. High risk + means the change would only help on the specific test case and would likely confuse + the model on unseen prompts. +- In benchmark mode, if `delta` is near zero (within 0.05), investigate whether the + assertions are non-discriminating before concluding the skill is ineffective. +- Prioritize `improvements_for_skill` by expected impact. High priority means the change + would plausibly improve pass rate by more than 10 percentage points. +- Do not suggest adding more instructions as a default. If the skill is not helping, + removing instructions (reducing noise) is often more effective than adding them. diff --git a/skills/skill-creator/agents/comparator.md b/skills/skill-creator/agents/comparator.md new file mode 100644 index 0000000..9ff7361 --- /dev/null +++ b/skills/skill-creator/agents/comparator.md @@ -0,0 +1,118 @@ +# Comparator Agent + +You are a blind A/B comparison agent for eval pipelines. You receive two sets of execution +outputs labeled A and B. You do not know which skill produced which output. Your role is +to produce a scored comparison without knowing the answer — this prevents confirmation bias +from affecting the verdict. + +## Inputs + +You will receive: +- `output_a_dir`: Path to the first execution's outputs directory +- `output_b_dir`: Path to the second execution's outputs directory +- `transcript_a`: Path to the first execution's transcript.md +- `transcript_b`: Path to the second execution's transcript.md +- `assertions` (optional): Assertion list from evals.json, as a secondary signal + +## Process + +### Step 1: Read all artifacts without bias + +Read all output files and transcripts for both A and B. Do not attempt to determine which +is "with skill" and which is "without skill." 
Treat them as two independent submissions
+competing on quality.
+
+### Step 2: Generate a rubric
+
+Before scoring, write a rubric with 4-6 evaluation criteria. Criteria must be grounded in
+the actual content — do not use generic criteria like "quality" without defining what
+quality means for this specific type of output.
+
+Example criteria for a SKILL.md creation eval:
+- Frontmatter completeness (required fields present and populated)
+- Phase structure quality (phases have clear inputs, outputs, and gate conditions)
+- Instruction specificity (steps are actionable, not aspirational)
+- Error handling coverage (top errors covered with cause/solution pairs)
+- Anti-rationalization presence and quality
+
+### Step 3: Score both outputs
+
+For each criterion, assign a score from 1 to 5:
+- 5: Excellent — exceeds expectations with specific, substantive content
+- 4: Good — meets expectations consistently
+- 3: Adequate — meets minimum requirements with some gaps
+- 2: Weak — below expectations, significant gaps
+- 1: Poor — fails to meet basic requirements
+
+Score A and B independently for each criterion. Do not adjust one score based on the
+other — each score must stand alone against the rubric.
+
+### Step 4: Check assertions (secondary signal)
+
+If assertions were provided, evaluate each output against them. This is a secondary
+signal to the rubric scores, not a replacement. A high assertion pass rate with low
+rubric scores indicates weak assertions.
+
+### Step 5: Determine winner
+
+Compute total rubric scores for A and B. The higher total is the winner. If scores are
+tied within 2 points, classify as "tie." Include the overall scores (1-10 scale, where
+10 means a perfect weighted score — rubric weights sum to 1.0 — on every criterion).
+ +## Output + +Produce a JSON file named `comparison.json` with exactly this structure: + +```json +{ + "eval_id": "string — the eval name/identifier", + "timestamp": "ISO 8601 timestamp", + "rubric": [ + { + "criterion": "criterion name", + "description": "what this criterion measures", + "weight": "float — relative importance, all weights sum to 1.0" + } + ], + "scores": { + "A": { + "criteria_scores": [ + { + "criterion": "criterion name", + "score": "integer 1-5", + "rationale": "specific evidence for this score" + } + ], + "total_score": "float — weighted sum of criteria scores normalized to 1-10", + "assertion_pass_rate": "float 0.0–1.0 — if assertions provided, else null" + }, + "B": { + "criteria_scores": [], + "total_score": "float", + "assertion_pass_rate": "float or null" + } + }, + "winner": "A | B | tie", + "winner_margin": "float — difference in total scores", + "reasoning": "string — 2-4 sentences explaining the decision, referencing specific criterion differences", + "confidence": "high | medium | low", + "comparator_notes": "optional — observations about the comparison that don't fit the rubric" +} +``` + +The schema is a contract. Field names, types, and nesting must match exactly. The +`analyzer.md` agent reads `winner`, `total_score`, and `reasoning` by field name. + +## Behavior Rules + +- Never attempt to determine which output is "with skill" or "without skill." You will + be unblinded by the analyzer agent after this step. +- Never use "quality" or "better" as criterion names without defining what they mean for + this specific content type. +- Each `rationale` must cite specific content from the output, not general impressions. + "A's error handling section covers 5 specific errors with cause/solution pairs" is + acceptable. "A's error handling seems more thorough" is not. +- If both outputs are identical or near-identical, set `winner` to "tie" and note this + in `comparator_notes`. 
+- If one output is clearly empty or failed, score all criteria 1 and set winner to + the non-empty output. Note the failure in `comparator_notes`. diff --git a/skills/skill-creator/agents/grader.md b/skills/skill-creator/agents/grader.md new file mode 100644 index 0000000..9665022 --- /dev/null +++ b/skills/skill-creator/agents/grader.md @@ -0,0 +1,105 @@ +# Grader Agent + +You are a grading agent for eval pipelines. Your role is to evaluate whether execution +outputs satisfy a set of assertions, producing cited evidence for every verdict. + +## Inputs + +You will receive: +- `expectations`: A list of assertion strings from `evals.json` +- `transcript_path`: Path to `transcript.md` from the execution run +- `outputs_dir`: Path to the `outputs/` directory from the execution run + +## Process + +### Step 1: Read all artifacts + +Read `transcript.md` in full. Read all files in `outputs/`. Build a complete picture of +what the execution produced before evaluating any assertion. + +### Step 2: Evaluate each assertion + +For each assertion in `expectations`: + +1. Determine whether it is PASS or FAIL based on the artifacts. +2. Cite specific evidence: quote the relevant section of transcript.md or the relevant + content from an output file. Do not assert PASS without pointing to the specific + content that satisfies the assertion. +3. If the assertion is ambiguous (could be interpreted in multiple ways), apply the + stricter interpretation and note the ambiguity. + +**Key rule**: PASS requires genuine substance, not surface compliance. Examples: +- Correct filename with wrong content → FAIL +- Correct structure with placeholder values → FAIL +- Required field present but empty → FAIL +- Required section heading present but no content under it → FAIL + +### Step 3: Extract and verify implicit claims + +After evaluating explicit assertions, scan the outputs for implicit claims — statements +or artifacts that appear to assert something specific. 
Verify 2-3 of the most significant +implicit claims. These are not scored against the pass rate but are included in the report +for the analyzer agent. + +### Step 4: Critique eval quality + +Identify non-discriminating assertions: assertions that would PASS regardless of whether +the skill was loaded. Flag these clearly because they inflate pass rates without measuring +skill-specific behavior. + +Examples of non-discriminating assertions: +- "Output is in English" +- "No error messages present" +- "Response is non-empty" +- "File exists" (if any execution would produce a file) + +## Output + +Produce a JSON file named `grading.json` with exactly this structure: + +```json +{ + "eval_id": "string — the eval name/identifier", + "configuration": "with_skill | without_skill", + "timestamp": "ISO 8601 timestamp", + "assertions": [ + { + "assertion": "the assertion text", + "verdict": "PASS | FAIL", + "evidence": "quoted excerpt or file reference supporting the verdict", + "confidence": "high | medium | low" + } + ], + "pass_count": "integer — number of PASS verdicts", + "fail_count": "integer — number of FAIL verdicts", + "pass_rate": "float 0.0–1.0", + "implicit_claims": [ + { + "claim": "the implicit claim identified", + "verdict": "VERIFIED | UNVERIFIED | CONTRADICTED", + "evidence": "supporting or contradicting evidence" + } + ], + "eval_critique": { + "non_discriminating_assertions": ["list of assertion texts flagged as non-discriminating"], + "recommendation": "string — suggested assertion improvements" + }, + "grader_notes": "optional string — any observations about unusual execution patterns" +} +``` + +The schema is a contract. Field names, types, and nesting must match exactly. The +`aggregate_benchmark.py` script parses `pass_rate`, `pass_count`, and `fail_count` +by name. + +## Behavior Rules + +- Never infer PASS from ambiguous evidence. When in doubt, FAIL with a note explaining + what evidence would be needed for PASS. +- Never skip an assertion. 
Every assertion in `expectations` must appear in `assertions`. +- The `evidence` field must contain a direct quote or file path reference. "Looks correct" + is not evidence. +- If `outputs/` is empty, all file-existence assertions are FAIL. Note this prominently + in `grader_notes`. +- If `transcript.md` contains error messages from the execution, note them in + `grader_notes` even if no assertion directly tests for errors. diff --git a/skills/skill-creator/assets/eval_viewer.html b/skills/skill-creator/assets/eval_viewer.html new file mode 100644 index 0000000..636532b --- /dev/null +++ b/skills/skill-creator/assets/eval_viewer.html @@ -0,0 +1,1189 @@ + + + + + +Blind A/B Code Review + + + + +
+
+

Blind A/B Code Review

+ Blind Mode +
+
+ + + +
+
+
+
+
+ + + + diff --git a/agents/skill-creator-engineer/references/anti-patterns.md b/skills/skill-creator/references/anti-patterns.md similarity index 100% rename from agents/skill-creator-engineer/references/anti-patterns.md rename to skills/skill-creator/references/anti-patterns.md diff --git a/skills/skill-creator/references/artifact-schemas.md b/skills/skill-creator/references/artifact-schemas.md new file mode 100644 index 0000000..98eac8b --- /dev/null +++ b/skills/skill-creator/references/artifact-schemas.md @@ -0,0 +1,302 @@ +# Artifact Schemas + +JSON contracts for all eval pipeline artifacts. Field names, types, and nesting are +contracts between producers and consumers. Downstream scripts parse by field name — +do not rename fields without updating all consumers. + +## Producer/Consumer Map + +| Schema | Producer | Consumer(s) | +|--------|----------|-------------| +| `evals.json` | Skill creator (human) | `run_eval.py`, grader agent | +| `grading.json` | grader agent | `aggregate_benchmark.py`, analyzer agent | +| `benchmark.json` | `aggregate_benchmark.py` | analyzer agent, `package_results.py` | +| `comparison.json` | comparator agent | analyzer agent | +| `analysis.json` | analyzer agent | `package_results.py`, skill creator | +| `timing.json` | `run_eval.py` | `aggregate_benchmark.py` | +| `metrics.json` | `run_eval.py` | grader agent | +| `eval_metadata.json` | `run_eval.py` | grader agent, comparator agent | +| `trigger-eval.json` | Skill creator (human) | `optimize_description.py` | + +--- + +## evals.json + +Location: `skill-workspace/evals/evals.json` + +```json +[ + { + "eval_id": "string — unique identifier for this eval, used as directory name", + "prompt": "string — the test prompt text passed to claude -p", + "assertions": [ + "string — one assertion per entry, binary and evidence-checkable" + ], + "metadata": { + "description": "string — optional human-readable description of what this eval tests", + "tags": ["optional array of tags for 
filtering"] + } + } +] +``` + +**Rules**: +- `eval_id` must be a valid directory name (kebab-case recommended) +- Each assertion must be binary: it either passes or fails, with evidence +- Assertions should test skill-specific behavior, not generic output properties + +--- + +## grading.json + +Location: `skill-workspace/iteration-N/{eval-id}/grading.json` + +```json +{ + "eval_id": "string — matches the eval_id from evals.json", + "configuration": "string — 'with_skill' or 'without_skill'", + "timestamp": "string — ISO 8601 timestamp", + "assertions": [ + { + "assertion": "string — the assertion text from evals.json", + "verdict": "string — 'PASS' or 'FAIL'", + "evidence": "string — quoted excerpt or file reference", + "confidence": "string — 'high', 'medium', or 'low'" + } + ], + "pass_count": "integer", + "fail_count": "integer", + "pass_rate": "float — range 0.0 to 1.0", + "implicit_claims": [ + { + "claim": "string", + "verdict": "string — 'VERIFIED', 'UNVERIFIED', or 'CONTRADICTED'", + "evidence": "string" + } + ], + "eval_critique": { + "non_discriminating_assertions": ["array of assertion text strings"], + "recommendation": "string" + }, + "grader_notes": "string or null" +} +``` + +**Required fields for `aggregate_benchmark.py`**: `pass_rate`, `pass_count`, `fail_count` + +--- + +## benchmark.json + +Location: `skill-workspace/iteration-N/benchmark.json` + +```json +{ + "skill_name": "string", + "workspace": "string — absolute path", + "timestamp": "string — ISO 8601", + "eval_count": "integer", + "with_skill": { + "pass_rate": { + "mean": "float", + "stddev": "float", + "min": "float", + "max": "float" + }, + "tokens": { + "mean": "float", + "stddev": "float" + }, + "time_seconds": { + "mean": "float", + "stddev": "float" + } + }, + "without_skill": { + "pass_rate": { "mean": "float", "stddev": "float", "min": "float", "max": "float" }, + "tokens": { "mean": "float", "stddev": "float" }, + "time_seconds": { "mean": "float", "stddev": "float" } + }, + 
"delta": { + "pass_rate": "float or null — with_skill minus without_skill", + "description": "string — human-readable interpretation" + }, + "eval_results": [ + { + "eval_id": "string", + "configuration": "string", + "pass_rate": "float", + "pass_count": "integer", + "fail_count": "integer", + "without_skill_pass_rate": "float or null", + "with_skill_tokens": "integer", + "with_skill_duration": "float", + "without_skill_tokens": "integer", + "without_skill_duration": "float" + } + ] +} +``` + +**Required fields for analyzer agent**: `with_skill.pass_rate.mean`, +`without_skill.pass_rate.mean`, `delta.pass_rate` + +--- + +## comparison.json + +Location: `skill-workspace/iteration-N/{eval-id}/comparison.json` + +```json +{ + "eval_id": "string", + "timestamp": "string — ISO 8601", + "rubric": [ + { + "criterion": "string", + "description": "string", + "weight": "float — all weights sum to 1.0" + } + ], + "scores": { + "A": { + "criteria_scores": [ + { + "criterion": "string — must match rubric criterion name", + "score": "integer — 1 to 5", + "rationale": "string — specific evidence" + } + ], + "total_score": "float — weighted sum normalized to 1-10 scale", + "assertion_pass_rate": "float or null" + }, + "B": { + "criteria_scores": [], + "total_score": "float", + "assertion_pass_rate": "float or null" + } + }, + "winner": "string — 'A', 'B', or 'tie'", + "winner_margin": "float — absolute difference in total_score", + "reasoning": "string — 2-4 sentences with specific criterion references", + "confidence": "string — 'high', 'medium', or 'low'", + "comparator_notes": "string or null" +} +``` + +**Required fields for analyzer agent**: `winner`, `scores.A.total_score`, +`scores.B.total_score`, `reasoning` + +--- + +## analysis.json + +Location: `skill-workspace/iteration-N/analysis.json` + +```json +{ + "mode": "string — 'comparison' or 'benchmark'", + "timestamp": "string — ISO 8601", + "skill_won": "boolean", + "findings": [ + { + "category": "string — one of: 
winner_factors, loser_improvements, instruction_analysis, transcript_waste, assertion_quality, metric_outliers, variance", + "priority": "string — 'high', 'medium', or 'low'", + "finding": "string — specific observation with evidence", + "actionable_suggestion": "string — concrete change" + } + ], + "improvements_for_skill": [ + { + "target": "string — which section/instruction", + "current_behavior": "string", + "desired_behavior": "string", + "rationale": "string", + "generalization_risk": "string — 'low', 'medium', or 'high'" + } + ], + "improvements_for_evals": [ + { + "assertion": "string", + "problem": "string", + "replacement": "string" + } + ], + "benchmark_summary": { + "with_skill_pass_rate_mean": "float or null", + "without_skill_pass_rate_mean": "float or null", + "delta": "float or null", + "comparator_win_rate": "float or null", + "top_failure_categories": ["array of strings"] + }, + "analyzer_notes": "string or null" +} +``` + +**Required fields for `package_results.py`**: `findings`, `improvements_for_skill`, +`benchmark_summary.delta` + +--- + +## timing.json + +Location: `skill-workspace/iteration-N/{eval-id}/{configuration}/timing.json` + +```json +{ + "duration_seconds": "float — wall-clock seconds for the claude -p run", + "tokens_total": "integer — sum of input_tokens and output_tokens", + "timed_out": "boolean — true if the run hit the timeout limit" +} +``` + +Produced by: `run_eval.py` +Consumed by: `aggregate_benchmark.py` + +--- + +## metrics.json + +Location: `skill-workspace/iteration-N/{eval-id}/{configuration}/metrics.json` + +```json +{ + "tool_usage": { + "Read": "integer — number of Read tool calls", + "Write": "integer", + "Edit": "integer", + "Bash": "integer", + "Grep": "integer", + "Glob": "integer", + "Agent": "integer" + }, + "total_tool_calls": "integer — sum of all tool_usage values" +} +``` + +Produced by: `run_eval.py` +Consumed by: grader agent (for context about execution behavior) + +--- + +## trigger-eval.json + 
+Location: `skill-workspace/evals/trigger-eval.json` + +```json +[ + { + "query": "string — user prompt to test triggering", + "should_trigger": "boolean — true if the skill should activate for this query" + } +] +``` + +**Conventions**: +- Include 10 should_trigger: true entries (vary directness and phrasing) +- Include 10 should_trigger: false entries (near-miss adjacent domains) +- Use realistic prompts with context, not abstract one-liners +- Test edge cases where the skill competes with adjacent skills + +Produced by: Skill creator (human) +Consumed by: `optimize_description.py` diff --git a/agents/skill-creator-engineer/references/complexity-examples.md b/skills/skill-creator/references/complexity-tiers.md similarity index 100% rename from agents/skill-creator-engineer/references/complexity-examples.md rename to skills/skill-creator/references/complexity-tiers.md diff --git a/agents/skill-creator-engineer/references/error-catalog.md b/skills/skill-creator/references/error-catalog.md similarity index 100% rename from agents/skill-creator-engineer/references/error-catalog.md rename to skills/skill-creator/references/error-catalog.md diff --git a/agents/skill-creator-engineer/references/skill-template.md b/skills/skill-creator/references/skill-template.md similarity index 100% rename from agents/skill-creator-engineer/references/skill-template.md rename to skills/skill-creator/references/skill-template.md diff --git a/agents/skill-creator-engineer/references/workflow-patterns.md b/skills/skill-creator/references/workflow-patterns.md similarity index 100% rename from agents/skill-creator-engineer/references/workflow-patterns.md rename to skills/skill-creator/references/workflow-patterns.md diff --git a/skills/skill-creator/scripts/aggregate_benchmark.py b/skills/skill-creator/scripts/aggregate_benchmark.py new file mode 100644 index 0000000..e4795e9 --- /dev/null +++ b/skills/skill-creator/scripts/aggregate_benchmark.py @@ -0,0 +1,259 @@ +#!/usr/bin/env python3 
+""" +aggregate_benchmark.py — Compute statistics across eval runs in an iteration workspace. + +Reads grading.json from each eval directory. Computes mean, standard deviation, and +delta (with_skill minus without_skill) for pass_rate, time_seconds, and tokens. + +Produces: + {workspace}/benchmark.json Machine-readable statistics + {workspace}/benchmark.md Human-readable summary +""" + +import argparse +import json +import math +import sys +from datetime import datetime, timezone +from pathlib import Path + + +def build_parser() -> argparse.ArgumentParser: + p = argparse.ArgumentParser( + description="Aggregate benchmark statistics from eval grading results", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + p.add_argument("workspace", help="Path to iteration workspace directory (e.g. skill-workspace/iteration-1)") + p.add_argument("--skill-name", required=True, help="Name of the skill being benchmarked") + return p + + +def find_eval_dirs(workspace: Path) -> list[Path]: + """Find all eval directories that contain grading.json.""" + eval_dirs = [] + for child in sorted(workspace.iterdir()): + if child.is_dir() and (child / "grading.json").exists(): + eval_dirs.append(child) + return eval_dirs + + +def load_grading(eval_dir: Path) -> dict | None: + """Load grading.json from an eval directory.""" + grading_path = eval_dir / "grading.json" + try: + return json.loads(grading_path.read_text()) + except (json.JSONDecodeError, OSError) as e: + print(f"WARNING: Could not load {grading_path}: {e}", file=sys.stderr) + return None + + +def load_timing(eval_dir: Path, configuration: str) -> dict: + """Load timing.json for a given configuration (with_skill or without_skill).""" + timing_path = eval_dir / configuration / "timing.json" + try: + return json.loads(timing_path.read_text()) + except (json.JSONDecodeError, OSError): + return {"duration_seconds": 0.0, "tokens_total": 0} + + +def mean(values: list[float]) -> float: + if not values: + return 
0.0 + return sum(values) / len(values) + + +def stddev(values: list[float]) -> float: + if len(values) < 2: + return 0.0 + m = mean(values) + variance = sum((v - m) ** 2 for v in values) / (len(values) - 1) + return math.sqrt(variance) + + +def aggregate(workspace: Path, skill_name: str) -> dict: + eval_dirs = find_eval_dirs(workspace) + if not eval_dirs: + print(f"ERROR: No eval directories with grading.json found in {workspace}", file=sys.stderr) + sys.exit(1) + + with_skill_pass_rates = [] + without_skill_pass_rates = [] + with_skill_tokens = [] + without_skill_tokens = [] + with_skill_durations = [] + without_skill_durations = [] + + eval_results = [] + + for eval_dir in eval_dirs: + grading = load_grading(eval_dir) + if grading is None: + continue + + config = grading.get("configuration") + if config not in ("with_skill", "without_skill"): + print(f"WARNING: {eval_dir.name}/grading.json missing 'configuration' field, skipping", file=sys.stderr) + continue + pass_rate = float(grading.get("pass_rate", 0.0)) + + with_timing = load_timing(eval_dir, "with_skill") + without_timing = load_timing(eval_dir, "without_skill") + + if config == "with_skill": + with_skill_pass_rates.append(pass_rate) + with_skill_tokens.append(float(with_timing.get("tokens_total", 0))) + with_skill_durations.append(float(with_timing.get("duration_seconds", 0))) + else: + without_skill_pass_rates.append(pass_rate) + without_skill_tokens.append(float(without_timing.get("tokens_total", 0))) + without_skill_durations.append(float(without_timing.get("duration_seconds", 0))) + + # Try to load the paired configuration if this is with_skill grading + # (eval dirs may contain only one grading.json; paired data comes from timing files) + without_pass_rate = None + paired_grading_path = eval_dir / "grading_without.json" + if paired_grading_path.exists(): + try: + paired = json.loads(paired_grading_path.read_text()) + without_pass_rate = float(paired.get("pass_rate", 0.0)) + except 
(json.JSONDecodeError, OSError): + pass + + eval_results.append( + { + "eval_id": eval_dir.name, + "configuration": config, + "pass_rate": pass_rate, + "pass_count": grading.get("pass_count", 0), + "fail_count": grading.get("fail_count", 0), + "without_skill_pass_rate": without_pass_rate, + "with_skill_tokens": with_timing.get("tokens_total", 0), + "with_skill_duration": with_timing.get("duration_seconds", 0), + "without_skill_tokens": without_timing.get("tokens_total", 0), + "without_skill_duration": without_timing.get("duration_seconds", 0), + } + ) + + # Compute aggregates + ws_mean = mean(with_skill_pass_rates) + wos_mean = mean(without_skill_pass_rates) + delta = ws_mean - wos_mean if with_skill_pass_rates and without_skill_pass_rates else None + + benchmark = { + "skill_name": skill_name, + "workspace": str(workspace), + "timestamp": datetime.now(timezone.utc).isoformat(), + "eval_count": len(eval_results), + "with_skill": { + "pass_rate": { + "mean": round(ws_mean, 4), + "stddev": round(stddev(with_skill_pass_rates), 4), + "min": round(min(with_skill_pass_rates), 4) if with_skill_pass_rates else 0.0, + "max": round(max(with_skill_pass_rates), 4) if with_skill_pass_rates else 0.0, + }, + "tokens": { + "mean": round(mean(with_skill_tokens), 1), + "stddev": round(stddev(with_skill_tokens), 1), + }, + "time_seconds": { + "mean": round(mean(with_skill_durations), 2), + "stddev": round(stddev(with_skill_durations), 2), + }, + }, + "without_skill": { + "pass_rate": { + "mean": round(wos_mean, 4), + "stddev": round(stddev(without_skill_pass_rates), 4), + "min": round(min(without_skill_pass_rates), 4) if without_skill_pass_rates else 0.0, + "max": round(max(without_skill_pass_rates), 4) if without_skill_pass_rates else 0.0, + }, + "tokens": { + "mean": round(mean(without_skill_tokens), 1), + "stddev": round(stddev(without_skill_tokens), 1), + }, + "time_seconds": { + "mean": round(mean(without_skill_durations), 2), + "stddev": round(stddev(without_skill_durations), 
2), + }, + }, + "delta": { + "pass_rate": round(delta, 4) if delta is not None else None, + "description": "with_skill minus without_skill; positive means skill helps", + }, + "eval_results": eval_results, + } + + return benchmark + + +def render_markdown(benchmark: dict) -> str: + ws = benchmark["with_skill"] + wos = benchmark["without_skill"] + delta = benchmark["delta"]["pass_rate"] + delta_str = f"+{delta:.1%}" if delta is not None and delta > 0 else (f"{delta:.1%}" if delta is not None else "N/A") + + lines = [ + f"# Benchmark: {benchmark['skill_name']}\n", + f"**Generated**: {benchmark['timestamp']} \n", + f"**Evals**: {benchmark['eval_count']}\n\n", + "## Pass Rate\n\n", + "| Configuration | Mean | StdDev | Min | Max |\n", + "|--------------|------|--------|-----|-----|\n", + f"| with_skill | {ws['pass_rate']['mean']:.1%} | {ws['pass_rate']['stddev']:.1%} | {ws['pass_rate']['min']:.1%} | {ws['pass_rate']['max']:.1%} |\n", + f"| without_skill | {wos['pass_rate']['mean']:.1%} | {wos['pass_rate']['stddev']:.1%} | {wos['pass_rate']['min']:.1%} | {wos['pass_rate']['max']:.1%} |\n", + f"| **delta** | **{delta_str}** | — | — | — |\n\n", + "## Token Usage\n\n", + "| Configuration | Mean Tokens | StdDev |\n", + "|--------------|-------------|--------|\n", + f"| with_skill | {ws['tokens']['mean']:.0f} | {ws['tokens']['stddev']:.0f} |\n", + f"| without_skill | {wos['tokens']['mean']:.0f} | {wos['tokens']['stddev']:.0f} |\n\n", + "## Duration (seconds)\n\n", + "| Configuration | Mean | StdDev |\n", + "|--------------|------|--------|\n", + f"| with_skill | {ws['time_seconds']['mean']:.1f}s | {ws['time_seconds']['stddev']:.1f}s |\n", + f"| without_skill | {wos['time_seconds']['mean']:.1f}s | {wos['time_seconds']['stddev']:.1f}s |\n\n", + "## Per-Eval Results\n\n", + "| Eval | Config | Pass Rate | Pass | Fail |\n", + "|------|--------|-----------|------|------|\n", + ] + + for er in benchmark["eval_results"]: + lines.append( + f"| {er['eval_id']} | {er['configuration']} | 
{er['pass_rate']:.1%} | {er['pass_count']} | {er['fail_count']} |\n" + ) + + return "".join(lines) + + +def main() -> int: + parser = build_parser() + args = parser.parse_args() + workspace = Path(args.workspace).resolve() + + if not workspace.exists(): + print(f"ERROR: Workspace directory does not exist: {workspace}", file=sys.stderr) + return 1 + + benchmark = aggregate(workspace, args.skill_name) + + benchmark_json = workspace / "benchmark.json" + benchmark_json.write_text(json.dumps(benchmark, indent=2)) + print(f"Written: {benchmark_json}", file=sys.stderr) + + benchmark_md = workspace / "benchmark.md" + benchmark_md.write_text(render_markdown(benchmark)) + print(f"Written: {benchmark_md}", file=sys.stderr) + + delta = benchmark["delta"]["pass_rate"] + if delta is not None: + sign = "+" if delta > 0 else "" + print(f"Pass rate delta: {sign}{delta:.1%} (with_skill vs without_skill)") + else: + print("Pass rate delta: N/A (missing one or both configurations)") + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/skills/skill-creator/scripts/eval_compare.py b/skills/skill-creator/scripts/eval_compare.py new file mode 100644 index 0000000..58f1849 --- /dev/null +++ b/skills/skill-creator/scripts/eval_compare.py @@ -0,0 +1,274 @@ +#!/usr/bin/env python3 +"""Generate blind A/B comparison HTML from eval workspace data. + +Scans workspace, collects output files, runs deterministic checks +(go build, go vet, go test -race where applicable), loads grading +and blind comparison data, injects into compare.html template. +Outputs compare_report.html. 
+ +Usage: + python3 eval_compare.py + python3 eval_compare.py --help +""" + +import argparse +import json +import os +import subprocess +import sys +from pathlib import Path + + +def build_parser() -> argparse.ArgumentParser: + p = argparse.ArgumentParser( + description="Generate blind A/B comparison HTML from eval workspace data.", + epilog="Workspace must contain compare.html template and iteration-*/ directories.", + ) + p.add_argument("workspace", type=Path, help="Path to the eval workspace directory") + p.add_argument( + "--output", type=Path, default=None, help="Output HTML path (default: /compare_report.html)" + ) + return p + + +def load_json_safe(path: Path) -> dict | None: + """Load JSON from a file, returning None on any error.""" + try: + return json.loads(path.read_text(encoding="utf-8")) + except (json.JSONDecodeError, OSError, UnicodeDecodeError) as e: + print(f"WARNING: Could not load {path}: {e}", file=sys.stderr) + return None + + +def read_text_safe(path: Path) -> str: + """Read text file with encoding fallback.""" + try: + return path.read_text(encoding="utf-8", errors="replace") + except OSError: + return "" + + +def find_files(outputs_dir: Path) -> list[str]: + """List all files relative to outputs dir.""" + files = [] + for root, _, filenames in os.walk(outputs_dir): + for f in filenames: + rel = os.path.relpath(Path(root, f), outputs_dir) + files.append(rel) + return sorted(files) + + +def count_go_lines(outputs_dir: Path) -> int: + """Count total lines across all .go files.""" + total = 0 + for root, _, filenames in os.walk(outputs_dir): + for f in filenames: + if f.endswith(".go"): + content = read_text_safe(Path(root, f)) + total += len(content.splitlines()) + return total + + +def get_code_preview(outputs_dir: Path, max_lines: int = 60) -> str: + """Get preview of main .go file content.""" + for root, _, filenames in os.walk(outputs_dir): + for f in sorted(filenames): + if f.endswith(".go") and not f.endswith("_test.go"): + content = 
read_text_safe(Path(root, f)) + lines = content.splitlines() + if len(lines) > max_lines: + return "\n".join(lines[:max_lines]) + f"\n... ({len(lines) - max_lines} more lines)" + return content + return "" + + +def run_go_check(outputs_dir: Path, cmd: list[str], timeout: int = 30) -> str: + """Run a go command in the outputs directory, return 'yes'/'no'/'clean'/'issues'.""" + # Find the go module root (prefer directory with go.mod) + mod_root = None + go_dirs = [] + for root, _, files in os.walk(outputs_dir): + if "go.mod" in files: + mod_root = root + break + if any(f.endswith(".go") for f in files): + go_dirs.append(root) + + target = mod_root or (go_dirs[0] if go_dirs else None) + if target is None: + return "no_go_files" + + try: + result = subprocess.run(cmd, cwd=target, capture_output=True, text=True, timeout=timeout) + if result.returncode == 0: + return "yes" if "build" in cmd or "test" in cmd else "clean" + return "no" if "build" in cmd or "test" in cmd else "issues" + except (subprocess.TimeoutExpired, FileNotFoundError): + return "skip" + + +def load_grading(variant_dir: Path) -> dict | None: + """Load and normalize grading.json.""" + path = variant_dir / "grading.json" + if not path.exists(): + return None + raw = load_json_safe(path) + if raw is None: + return None + exps = raw.get("expectations", raw.get("assertions", [])) + normalized = [] + for e in exps: + text = e.get("text", e.get("assertion", "?")) + is_pass = e.get("passed") is True or e.get("verdict", "") == "PASS" + evidence = e.get("evidence", "") + normalized.append({"text": text, "passed": is_pass, "evidence": evidence}) + passed = sum(1 for n in normalized if n["passed"]) + tl = raw.get("pass_count") + if tl is not None: + passed = tl + total = len(normalized) + return { + "expectations": normalized, + "summary": { + "passed": passed, + "failed": total - passed, + "total": total, + "pass_rate": round(passed / total, 3) if total > 0 else 0, + }, + } + + +def build_variant_data(variant_dir: 
Path) -> dict: + """Build data dict for one variant.""" + outputs = variant_dir / "outputs" + if not outputs.exists(): + return {} + files = find_files(outputs) + return { + "lines": count_go_lines(outputs), + "files": files, + "fileCount": len(files), + "code_preview": get_code_preview(outputs), + "compiles": run_go_check(outputs, ["go", "build", "./..."]), + "tests_pass": run_go_check(outputs, ["go", "test", "-race", "-count=1", "./..."]), + "govet": run_go_check(outputs, ["go", "vet", "./..."]), + "grading": load_grading(variant_dir), + } + + +def find_iteration_dirs(workspace: Path) -> list[Path]: + """Find all iteration-N directories, sorted by number.""" + dirs = sorted(workspace.glob("iteration-*")) + return [d for d in dirs if d.is_dir()] + + +def build_data(workspace: Path) -> dict: + """Build full comparison data.""" + evals_path = workspace / "evals" / "evals.json" + evals_meta = {} + evals_raw = None + if evals_path.exists(): + evals_raw = load_json_safe(evals_path) + if evals_raw: + for ev in evals_raw.get("evals", []): + evals_meta[ev.get("name", ev.get("id", ""))] = ev + + evals_data = [] + benchmark = [] + + # Use the latest iteration directory (or iteration-1 as fallback) + iterations = find_iteration_dirs(workspace) + if not iterations: + return { + "evals": [], + "benchmark": [], + "variantAName": "Variant A", + "variantBName": "Variant B", + "variantCName": "Variant C", + } + + iteration = iterations[-1] # Latest iteration + + for eval_dir in sorted(iteration.iterdir()): + if not eval_dir.is_dir(): + continue + name = eval_dir.name + a_data = build_variant_data(eval_dir / "variant-A") + b_data = build_variant_data(eval_dir / "variant-B") + c_data = build_variant_data(eval_dir / "variant-C") + + prompt = evals_meta.get(name, {}).get("prompt", "") + + # Load blind comparisons if available + blind = ( + load_json_safe(eval_dir / "blind_comparison.json") + if (eval_dir / "blind_comparison.json").exists() + else None + ) + blind_bc = ( + 
load_json_safe(eval_dir / "blind_comparison_bc.json") + if (eval_dir / "blind_comparison_bc.json").exists() + else None + ) + + eval_entry = { + "name": name, + "prompt": prompt, + "variantA": a_data, + "variantB": b_data, + "blind_comparison": blind, + "blind_comparison_bc": blind_bc, + } + if c_data: + eval_entry["variantC"] = c_data + evals_data.append(eval_entry) + + a_rate = a_data.get("grading", {}).get("summary", {}).get("pass_rate", 0) if a_data.get("grading") else 0 + b_rate = b_data.get("grading", {}).get("summary", {}).get("pass_rate", 0) if b_data.get("grading") else 0 + c_rate = c_data.get("grading", {}).get("summary", {}).get("pass_rate", 0) if c_data.get("grading") else 0 + bm = {"name": name, "aRate": a_rate, "bRate": b_rate} + if c_data: + bm["cRate"] = c_rate + benchmark.append(bm) + + variants = evals_raw.get("variants", {}) if evals_raw else {} + + return { + "evals": evals_data, + "benchmark": benchmark, + "variantAName": variants.get("A", {}).get("name", "Variant A"), + "variantBName": variants.get("B", {}).get("name", "Variant B"), + "variantCName": variants.get("C", {}).get("name", "Variant C"), + } + + +def main() -> int: + parser = build_parser() + args = parser.parse_args() + + workspace = args.workspace.resolve() + template = workspace / "compare.html" + output = (args.output or workspace / "compare_report.html").resolve() + + if not template.exists(): + print(f"Error: {template} not found", file=sys.stderr) + return 1 + + data = build_data(workspace) + html = read_text_safe(template).replace("__DATA_PLACEHOLDER__", json.dumps(data, indent=2)) + output.write_text(html, encoding="utf-8") + + print(f"Report: {output}") + print(f"Evals: {len(data['evals'])}") + for ev in data["evals"]: + a = ev.get("variantA", {}) + b = ev.get("variantB", {}) + print( + f" {ev['name']}: A={a.get('lines', 0)}L/{a.get('compiles', '?')} B={b.get('lines', 0)}L/{b.get('compiles', '?')}" + ) + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff 
--git a/skills/skill-creator/scripts/optimize_description.py b/skills/skill-creator/scripts/optimize_description.py new file mode 100644 index 0000000..ae36723 --- /dev/null +++ b/skills/skill-creator/scripts/optimize_description.py @@ -0,0 +1,339 @@ +#!/usr/bin/env python3 +""" +optimize_description.py — Train/test description optimization for skill triggering accuracy. + +Splits eval queries 60/40 train/test. Evaluates the current description (3 runs per query +for variance reduction). Proposes improvements based on train set failures. Re-evaluates +on both sets. Selects best description by test score to prevent overfitting. + +Eval set format (trigger-eval.json): + [ + {"query": "user prompt text", "should_trigger": true}, + {"query": "adjacent domain prompt", "should_trigger": false} + ] +""" + +import argparse +import json +import math +import random +import shutil +import subprocess +import sys +import tempfile +from datetime import datetime, timezone +from pathlib import Path + +RUNS_PER_QUERY = 3 # Runs per query for variance reduction + + +def build_parser() -> argparse.ArgumentParser: + p = argparse.ArgumentParser( + description="Optimize skill description for triggering accuracy", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + p.add_argument("--skill-path", required=True, help="Path to the skill directory (contains SKILL.md)") + p.add_argument("--eval-set", required=True, help="Path to trigger-eval.json") + p.add_argument("--model", default="claude-sonnet-4-6", help="Claude model to use (default: claude-sonnet-4-6)") + p.add_argument("--max-iterations", type=int, default=5, help="Maximum optimization iterations (default: 5)") + p.add_argument("--seed", type=int, default=42, help="Random seed for train/test split (default: 42)") + p.add_argument("--dry-run", action="store_true", help="Show split and current accuracy without optimizing") + return p + + +def check_claude_available() -> None: + if shutil.which("claude") is 
None: + print( + "ERROR: 'claude' CLI not found in PATH.\nInstall with: npm install -g @anthropic-ai/claude-code", + file=sys.stderr, + ) + sys.exit(1) + + +def load_eval_set(eval_path: Path) -> list[dict]: + try: + data = json.loads(eval_path.read_text()) + except (json.JSONDecodeError, OSError) as e: + print(f"ERROR: Could not load eval set {eval_path}: {e}", file=sys.stderr) + sys.exit(1) + + if not isinstance(data, list) or not data: + print("ERROR: eval set must be a non-empty JSON array", file=sys.stderr) + sys.exit(1) + + for entry in data: + if "query" not in entry or "should_trigger" not in entry: + print( + f"ERROR: each eval entry must have 'query' and 'should_trigger' fields. Got: {entry}", + file=sys.stderr, + ) + sys.exit(1) + + return data + + +def split_eval_set(eval_set: list[dict], seed: int) -> tuple[list[dict], list[dict]]: + """60/40 train/test split, stratified by should_trigger.""" + rng = random.Random(seed) + should_trigger = [e for e in eval_set if e["should_trigger"]] + should_not = [e for e in eval_set if not e["should_trigger"]] + + def split(items: list) -> tuple[list, list]: + shuffled = items[:] + rng.shuffle(shuffled) + split_point = math.ceil(len(shuffled) * 0.6) + return shuffled[:split_point], shuffled[split_point:] + + train_trigger, test_trigger = split(should_trigger) + train_no, test_no = split(should_not) + return train_trigger + train_no, test_trigger + test_no + + +def test_trigger(query: str, description: str, model: str) -> bool: + """ + Ask claude whether it would use the skill given this description and query. + Returns True if the skill should trigger, False otherwise. + """ + prompt = ( + f"You are a routing system. A skill has this description:\n\n" + f"---\n{description}\n---\n\n" + f'A user says: "{query}"\n\n' + f"Answer with exactly one word: YES if you would use this skill for this request, " + f"NO if you would not. Do not explain." 
+ ) + + try: + result = subprocess.run( + ["claude", "-p", prompt, "--model", model], + capture_output=True, + text=True, + timeout=30, + ) + if result.returncode != 0: + print( + f"WARNING: claude exited {result.returncode}: {result.stderr[:200]}", + file=sys.stderr, + ) + return False + answer = result.stdout.strip().upper() + return answer.startswith("YES") + except subprocess.TimeoutExpired: + return False + + +def evaluate_description(description: str, eval_queries: list[dict], model: str, runs: int = RUNS_PER_QUERY) -> float: + """Evaluate a description against a set of queries. Returns accuracy (0.0-1.0).""" + if not eval_queries: + return 0.0 + + correct = 0 + total = 0 + + for entry in eval_queries: + query = entry["query"] + should_trigger = entry["should_trigger"] + + # Run multiple times for variance reduction; take majority vote + votes = [test_trigger(query, description, model) for _ in range(runs)] + majority_triggered = votes.count(True) > runs / 2 + + if majority_triggered == should_trigger: + correct += 1 + total += 1 + + return correct / total if total > 0 else 0.0 + + +def propose_improvement( + description: str, + train_queries: list[dict], + failures: list[dict], + model: str, +) -> str: + """ + Ask claude to propose a better description based on train set failures. + Returns the proposed description text. + """ + failure_examples = "\n".join( + f'- Query: "{f["query"]}" | Expected: {"TRIGGER" if f["should_trigger"] else "NO TRIGGER"} | Got: {"TRIGGER" if f["triggered"] else "NO TRIGGER"}' + for f in failures[:10] # Cap at 10 examples to avoid prompt bloat + ) + + prompt = ( + f"You are improving a Claude skill's description to optimize triggering accuracy.\n\n" + f"Current description:\n---\n{description}\n---\n\n" + f"Failures on training set:\n{failure_examples}\n\n" + f"Requirements:\n" + f"1. Keep the description under 1024 characters\n" + f"2. No XML angle brackets (< or >)\n" + f"3. Maintain the What+When formula: 'Do X when Y. 
Use for [triggers]. Do NOT use for [anti-triggers].'\n" + f"4. Do not overfit to the failure examples — improve the description generally\n" + f"5. Return ONLY the new description text, no explanation\n\n" + f"New description:" + ) + + try: + result = subprocess.run( + ["claude", "-p", prompt, "--model", model], + capture_output=True, + text=True, + timeout=60, + ) + if result.returncode != 0: + print( + f"WARNING: claude exited {result.returncode} proposing improvement: {result.stderr[:200]}", + file=sys.stderr, + ) + return description + proposed = result.stdout.strip() + if not proposed: + print("WARNING: claude returned empty improvement. Keeping current.", file=sys.stderr) + return description + if len(proposed) > 1024: + print(f"WARNING: Proposed description exceeds 1024 chars ({len(proposed)}). Truncating.", file=sys.stderr) + proposed = proposed[:1020] + "..." + return proposed + except subprocess.TimeoutExpired: + print("WARNING: Timeout proposing description improvement. Keeping current.", file=sys.stderr) + return description + + +def identify_failures(description: str, queries: list[dict], model: str) -> list[dict]: + """Return list of queries where the description produced incorrect routing.""" + failures = [] + for entry in queries: + query = entry["query"] + should_trigger = entry["should_trigger"] + votes = [test_trigger(query, description, model) for _ in range(RUNS_PER_QUERY)] + triggered = votes.count(True) > RUNS_PER_QUERY / 2 + if triggered != should_trigger: + failures.append({**entry, "triggered": triggered}) + return failures + + +def optimize(args: argparse.Namespace) -> int: + check_claude_available() + + skill_path = Path(args.skill_path).resolve() + skill_md = skill_path / "SKILL.md" + eval_path = Path(args.eval_set).resolve() + + if not skill_md.exists(): + print(f"ERROR: SKILL.md not found at {skill_md}", file=sys.stderr) + return 1 + + eval_set = load_eval_set(eval_path) + train_set, test_set = split_eval_set(eval_set, seed=args.seed) 
+ + print(f"Eval set: {len(eval_set)} queries ({len(train_set)} train, {len(test_set)} test)", file=sys.stderr) + + # Extract current description from SKILL.md frontmatter + skill_text = skill_md.read_text() + description_start = skill_text.find("description: |") + if description_start == -1: + print("ERROR: Could not find 'description: |' in SKILL.md frontmatter", file=sys.stderr) + return 1 + + # Extract description block (lines until next YAML key) + lines = skill_text.split("\n") + desc_lines = [] + in_desc = False + for line in lines: + if line.strip().startswith("description: |"): + in_desc = True + continue + if in_desc: + if line and not line[0].isspace() and ":" in line: + break + desc_lines.append(line.lstrip()) + + current_description = "\n".join(desc_lines).strip() + print(f"Current description ({len(current_description)} chars)", file=sys.stderr) + + if args.dry_run: + train_acc = evaluate_description(current_description, train_set, args.model) + test_acc = evaluate_description(current_description, test_set, args.model) + print(f"Train accuracy: {train_acc:.1%}") + print(f"Test accuracy: {test_acc:.1%}") + return 0 + + # Evaluate initial accuracy + print("Evaluating initial description...", file=sys.stderr) + best_description = current_description + best_test_acc = evaluate_description(current_description, test_set, args.model) + print(f"Initial test accuracy: {best_test_acc:.1%}", file=sys.stderr) + + history = [{"iteration": 0, "description": current_description, "test_accuracy": best_test_acc}] + + for iteration in range(1, args.max_iterations + 1): + print(f"\nIteration {iteration}/{args.max_iterations}", file=sys.stderr) + + failures = identify_failures(best_description, train_set, args.model) + train_acc = 1.0 - (len(failures) / len(train_set)) if train_set else 0.0 + print(f"Train accuracy: {train_acc:.1%} ({len(failures)} failures)", file=sys.stderr) + + if not failures: + print("No failures on train set. 
Optimization complete.", file=sys.stderr) + break + + proposed = propose_improvement(best_description, train_set, failures, args.model) + proposed_test_acc = evaluate_description(proposed, test_set, args.model) + print(f"Proposed test accuracy: {proposed_test_acc:.1%}", file=sys.stderr) + + history.append( + { + "iteration": iteration, + "description": proposed, + "train_accuracy": train_acc, + "test_accuracy": proposed_test_acc, + } + ) + + if proposed_test_acc >= best_test_acc: + best_description = proposed + best_test_acc = proposed_test_acc + print(f"Accepted (test accuracy improved or held: {best_test_acc:.1%})", file=sys.stderr) + else: + print(f"Rejected (test accuracy decreased: {proposed_test_acc:.1%} < {best_test_acc:.1%})", file=sys.stderr) + + # Report results + print(f"\n=== Optimization Complete ===") + print(f"Best test accuracy: {best_test_acc:.1%}") + print(f"Iterations run: {len(history) - 1}") + + if best_description != current_description: + print(f"\nBest description ({len(best_description)} chars):\n") + print(best_description) + else: + print("\nNo improvement found. 
Current description is already optimal.") + + # Write history to optimization_history.json alongside the eval set + history_path = eval_path.parent / "optimization_history.json" + history_path.write_text( + json.dumps( + { + "skill_path": str(skill_path), + "eval_set": str(eval_path), + "model": args.model, + "timestamp": datetime.now(timezone.utc).isoformat(), + "best_test_accuracy": best_test_acc, + "best_description": best_description, + "history": history, + }, + indent=2, + ) + ) + print(f"\nHistory written: {history_path}", file=sys.stderr) + + return 0 + + +def main() -> int: + parser = build_parser() + args = parser.parse_args() + return optimize(args) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/skills/skill-creator/scripts/package_results.py b/skills/skill-creator/scripts/package_results.py new file mode 100644 index 0000000..07ce725 --- /dev/null +++ b/skills/skill-creator/scripts/package_results.py @@ -0,0 +1,218 @@ +#!/usr/bin/env python3 +""" +package_results.py — Consolidate all iteration artifacts into a summary report. + +Reads grading.json, benchmark.json, analysis.json, and changes.md from each iteration +directory in the workspace. Produces a single summary report. 
+ +Usage: + python3 package_results.py workspace/ --format markdown + python3 package_results.py workspace/ --format json +""" + +import argparse +import json +import sys +from datetime import datetime, timezone +from pathlib import Path + + +def build_parser() -> argparse.ArgumentParser: + p = argparse.ArgumentParser( + description="Consolidate eval iteration artifacts into a summary report", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + p.add_argument("workspace", help="Path to skill-workspace/ root directory") + p.add_argument( + "--format", choices=["markdown", "json"], default="markdown", help="Output format (default: markdown)" + ) + p.add_argument("--output", help="Output file path (default: workspace/summary.md or summary.json)") + return p + + +def find_iteration_dirs(workspace: Path) -> list[Path]: + """Find all iteration-N directories in the workspace.""" + iterations = [] + for child in sorted(workspace.iterdir()): + if child.is_dir() and child.name.startswith("iteration-"): + try: + int(child.name.split("-")[1]) + iterations.append(child) + except (IndexError, ValueError): + pass + return sorted(iterations, key=lambda p: int(p.name.split("-")[1])) + + +def load_json_safe(path: Path) -> dict | list | None: + if not path.exists(): + return None + try: + return json.loads(path.read_text()) + except (json.JSONDecodeError, OSError): + return None + + +def load_text_safe(path: Path) -> str | None: + if not path.exists(): + return None + try: + return path.read_text() + except OSError: + return None + + +def collect_iteration_data(iteration_dir: Path) -> dict: + """Collect all artifacts from a single iteration directory.""" + data = { + "iteration": iteration_dir.name, + "benchmark": load_json_safe(iteration_dir / "benchmark.json"), + "analysis": load_json_safe(iteration_dir / "analysis.json"), + "changes": load_text_safe(iteration_dir / "changes.md"), + "evals": [], + } + + # Collect per-eval data + for child in 
sorted(iteration_dir.iterdir()): + if child.is_dir(): + grading = load_json_safe(child / "grading.json") + if grading: + data["evals"].append( + { + "eval_id": child.name, + "grading": grading, + } + ) + + return data + + +def render_markdown(workspace: Path, iterations: list[dict]) -> str: + lines = [ + "# Skill Eval Summary\n", + f"**Workspace**: `{workspace}` \n", + f"**Generated**: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')} \n", + f"**Iterations**: {len(iterations)}\n\n", + ] + + # Progress table across iterations + if any(it["benchmark"] for it in iterations): + lines.append("## Pass Rate Progression\n\n") + lines.append("| Iteration | With Skill | Without Skill | Delta |\n") + lines.append("|-----------|-----------|---------------|-------|\n") + + for it in iterations: + b = it["benchmark"] + if b: + ws = b.get("with_skill", {}).get("pass_rate", {}).get("mean", 0) + wos = b.get("without_skill", {}).get("pass_rate", {}).get("mean", 0) + delta = b.get("delta", {}).get("pass_rate") + delta_str = ( + f"+{delta:.1%}" + if delta is not None and delta > 0 + else (f"{delta:.1%}" if delta is not None else "N/A") + ) + lines.append(f"| {it['iteration']} | {ws:.1%} | {wos:.1%} | {delta_str} |\n") + else: + lines.append(f"| {it['iteration']} | — | — | — |\n") + + lines.append("\n") + + # Per-iteration sections + for it in iterations: + lines.append(f"## {it['iteration'].replace('-', ' ').title()}\n\n") + + # Changes summary + if it["changes"]: + lines.append("### Changes Made\n\n") + # Include first 50 lines of changes.md + change_lines = it["changes"].split("\n")[:50] + lines.append("\n".join(change_lines)) + if len(it["changes"].split("\n")) > 50: + lines.append("\n_(truncated — see changes.md for full content)_") + lines.append("\n\n") + + # Eval results + if it["evals"]: + lines.append("### Eval Results\n\n") + lines.append("| Eval | Pass Rate | Pass | Fail |\n") + lines.append("|------|-----------|------|------|\n") + for ev in it["evals"]: + g = 
ev["grading"] + lines.append( + f"| {ev['eval_id']} | {g.get('pass_rate', 0):.1%} | {g.get('pass_count', 0)} | {g.get('fail_count', 0)} |\n" + ) + lines.append("\n") + + # Top findings from analysis + if it["analysis"]: + findings = it["analysis"].get("findings", []) + high_priority = [f for f in findings if f.get("priority") == "high"] + if high_priority: + lines.append("### High-Priority Findings\n\n") + for f in high_priority[:5]: + lines.append(f"- **{f.get('category', 'finding')}**: {f.get('finding', '')}\n") + if f.get("actionable_suggestion"): + lines.append(f" - Suggestion: {f['actionable_suggestion']}\n") + lines.append("\n") + + # Final recommendation + if iterations: + last = iterations[-1] + b = last.get("benchmark") + if b: + delta = b.get("delta", {}).get("pass_rate") + if delta is not None: + lines.append("## Final Assessment\n\n") + if delta > 0.05: + lines.append(f"The skill demonstrates measurable improvement: pass rate delta = +{delta:.1%}\n") + elif delta < -0.05: + lines.append(f"The skill performs below baseline: pass rate delta = {delta:.1%}\n") + lines.append( + "Consider reviewing skill instructions — they may be adding noise rather than signal.\n" + ) + else: + lines.append(f"The skill shows marginal impact: pass rate delta = {delta:.1%}\n") + lines.append("Check whether eval assertions are discriminating (test skill-specific behavior).\n") + + return "".join(lines) + + +def main() -> int: + parser = build_parser() + args = parser.parse_args() + workspace = Path(args.workspace).resolve() + + if not workspace.exists(): + print(f"ERROR: Workspace does not exist: {workspace}", file=sys.stderr) + return 1 + + iteration_dirs = find_iteration_dirs(workspace) + if not iteration_dirs: + print(f"WARNING: No iteration directories found in {workspace}", file=sys.stderr) + + iterations = [collect_iteration_data(d) for d in iteration_dirs] + + if args.format == "markdown": + content = render_markdown(workspace, iterations) + default_name = "summary.md" 
+ else: + content = json.dumps( + { + "workspace": str(workspace), + "generated": datetime.now(timezone.utc).isoformat(), + "iterations": iterations, + }, + indent=2, + ) + default_name = "summary.json" + + output_path = Path(args.output).resolve() if args.output else (workspace / default_name) + output_path.write_text(content) + print(f"Written: {output_path}") + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/skills/skill-creator/scripts/run_eval.py b/skills/skill-creator/scripts/run_eval.py new file mode 100644 index 0000000..d83ce2a --- /dev/null +++ b/skills/skill-creator/scripts/run_eval.py @@ -0,0 +1,206 @@ +#!/usr/bin/env python3 +""" +run_eval.py — Execute a skill against a test prompt via claude -p subprocess. + +Produces in --output-dir: + outputs/ All files written during the run + transcript.md Full execution log + timing.json Token count and wall-clock duration + metrics.json Tool usage counts +""" + +import argparse +import json +import os +import shutil +import subprocess +import sys +import tempfile +import time +from pathlib import Path + + +def build_parser() -> argparse.ArgumentParser: + p = argparse.ArgumentParser( + description="Execute a skill against a test prompt via claude -p", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + p.add_argument("--skill-path", required=True, help="Path to the skill directory (contains SKILL.md)") + p.add_argument("--prompt", required=True, help="Test prompt text to run") + p.add_argument("--output-dir", required=True, help="Directory to write outputs, transcript, timing, metrics") + p.add_argument("--model", default="claude-sonnet-4-6", help="Claude model to use (default: claude-sonnet-4-6)") + p.add_argument("--no-skill", action="store_true", help="Run without loading the skill (baseline run)") + p.add_argument("--timeout", type=int, default=300, help="Max seconds to wait for claude -p (default: 300)") + return p + + +def check_claude_available() -> 
None: + """Verify claude CLI is in PATH. Exit 1 with actionable message if not.""" + if shutil.which("claude") is None: + print( + "ERROR: 'claude' CLI not found in PATH.\n" + "Install with: npm install -g @anthropic-ai/claude-code\n" + "Verify with: which claude && claude --version", + file=sys.stderr, + ) + sys.exit(1) + + +def prepare_output_dir(output_dir: Path) -> Path: + """Create output directory structure. Returns outputs/ subdirectory.""" + output_dir.mkdir(parents=True, exist_ok=True) + outputs = output_dir / "outputs" + outputs.mkdir(exist_ok=True) + return outputs + + +def build_claude_command( + skill_path: Path, + prompt: str, + outputs_dir: Path, + model: str, + no_skill: bool, +) -> list[str]: + """Construct the claude -p command with appropriate flags.""" + cmd = [ + "claude", + "-p", + prompt, + "--model", + model, + "--output-format", + "json", + ] + + if not no_skill: + skill_md = skill_path / "SKILL.md" + if not skill_md.exists(): + print(f"ERROR: SKILL.md not found at {skill_md}", file=sys.stderr) + sys.exit(1) + cmd.extend(["--system-prompt-file", str(skill_md)]) + + # Ask claude to write outputs to the outputs directory + cmd.extend( + [ + "--working-dir", + str(outputs_dir), + ] + ) + + return cmd + + +def count_tools(transcript_text: str) -> dict: + """Count tool invocations by type from transcript text.""" + import re + + tool_pattern = re.compile(r'"tool":\s*"([^"]+)"') + counts: dict[str, int] = {} + for match in tool_pattern.finditer(transcript_text): + tool = match.group(1) + counts[tool] = counts.get(tool, 0) + 1 + return counts + + +def run_eval(args: argparse.Namespace) -> int: + check_claude_available() + + skill_path = Path(args.skill_path).resolve() + output_dir = Path(args.output_dir).resolve() + outputs_dir = prepare_output_dir(output_dir) + + cmd = build_claude_command( + skill_path=skill_path, + prompt=args.prompt, + outputs_dir=outputs_dir, + model=args.model, + no_skill=args.no_skill, + ) + + print(f"Running: {' 
'.join(cmd[:4])} ...", file=sys.stderr) + start_time = time.monotonic() + + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=args.timeout, + cwd=str(outputs_dir), + ) + except subprocess.TimeoutExpired: + print(f"ERROR: claude -p timed out after {args.timeout}s", file=sys.stderr) + (output_dir / "transcript.md").write_text( + f"# Execution Timeout\n\nRun timed out after {args.timeout} seconds.\n" + ) + _write_timing(output_dir, duration=float(args.timeout), tokens=0, timed_out=True) + _write_metrics(output_dir, tool_counts={}) + return 1 + + duration = time.monotonic() - start_time + + # Write transcript + transcript_lines = [ + "# Execution Transcript\n", + f"**Model**: {args.model}\n", + f"**Skill loaded**: {not args.no_skill}\n", + f"**Duration**: {duration:.2f}s\n", + f"**Exit code**: {result.returncode}\n\n", + "## stdout\n\n```\n", + result.stdout or "(empty)", + "\n```\n\n## stderr\n\n```\n", + result.stderr or "(empty)", + "\n```\n", + ] + transcript_text = "".join(transcript_lines) + (output_dir / "transcript.md").write_text(transcript_text) + + # Parse token counts from JSON output if available + tokens = 0 + try: + response = json.loads(result.stdout) + usage = response.get("usage", {}) + tokens = usage.get("input_tokens", 0) + usage.get("output_tokens", 0) + except (json.JSONDecodeError, AttributeError): + pass + + _write_timing(output_dir, duration=duration, tokens=tokens, timed_out=False) + _write_metrics(output_dir, tool_counts=count_tools(result.stdout + result.stderr)) + + if result.returncode != 0: + print( + f"WARNING: claude -p exited with code {result.returncode}. Check transcript.md for details.", + file=sys.stderr, + ) + return result.returncode + + print(f"Eval complete. 
Outputs: {output_dir}", file=sys.stderr) + return 0 + + +def _write_timing(output_dir: Path, duration: float, tokens: int, timed_out: bool) -> None: + timing = { + "duration_seconds": round(duration, 3), + "tokens_total": tokens, + "timed_out": timed_out, + } + (output_dir / "timing.json").write_text(json.dumps(timing, indent=2)) + + +def _write_metrics(output_dir: Path, tool_counts: dict) -> None: + metrics = { + "tool_usage": tool_counts, + "total_tool_calls": sum(tool_counts.values()), + } + (output_dir / "metrics.json").write_text(json.dumps(metrics, indent=2)) + + +def main() -> int: + parser = build_parser() + args = parser.parse_args() + return run_eval(args) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/skills/skill-eval/SKILL.md b/skills/skill-eval/SKILL.md index 8bd7ac4..26cbfcf 100644 --- a/skills/skill-eval/SKILL.md +++ b/skills/skill-eval/SKILL.md @@ -7,11 +7,11 @@ description: | with A/B comparisons, and validate skill structure. Use when user says "improve skill", "test skill triggers", "optimize description", "benchmark skill", "eval skill", or "skill quality". Do NOT use for creating new skills - (use skill-creator-engineer). + (use skill-creator). 
version: 1.0.0 user-invocable: false argument-hint: "" -agent: skill-creator-engineer +agent: skill-creator allowed-tools: - Read - Write @@ -66,7 +66,7 @@ This skill operates as the eval-driven improvement pipeline for Claude Code skil - Generate HTML reports for visual review ## What This Skill CANNOT Do -- Create new skills from scratch (use skill-creator-engineer) +- Create new skills from scratch (use skill-creator) - Modify skill instructions automatically (human reviews changes) - Test skills that require specific MCP servers or external services - Run evals without the `claude` CLI available diff --git a/skills/workflow-help/SKILL.md b/skills/workflow-help/SKILL.md index 9c6d00a..50729a5 100644 --- a/skills/workflow-help/SKILL.md +++ b/skills/workflow-help/SKILL.md @@ -62,7 +62,7 @@ This skill operates as an operator for workflow education and guidance, configur ## What This Skill CANNOT Do - Execute workflows (use workflow-orchestrator) - Debug code (use systematic-debugging) -- Create or modify skills (use skill-creator-engineer) +- Create or modify skills (use skill-creator) - Run tests or validate code (use verification-before-completion) - Make decisions about which approach to take for the user's actual task