diff --git a/.env.example b/.env.example index ed57670..c357316 100644 --- a/.env.example +++ b/.env.example @@ -9,3 +9,12 @@ GEMINI_API_KEY= # Max iterations before stopping (default: 100) # OODA_MAX_ITERATIONS=100 + +# ── Executive Harness ──────────────────────────────────────────────────── +# Coding agent CLI for spawning autoresearch instances +# Options: 'claude-code', 'codex', or null (manual mode) +# AGENT_COMMAND=claude-code + +# Autoresearch Hub URL (for agenthub multi-agent mode) +# AUTORESEARCH_HUB_URL=https://autoresearchhub.com +# AUTORESEARCH_API_KEY= diff --git a/.gitignore b/.gitignore index 4e6b61a..1a3bf4b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,10 @@ node_modules/ .env _ooda/ +.executive/ *.db *.db-journal .DS_Store +autoresearch/ +results.tsv +run.log diff --git a/README.md b/README.md index b17c55c..d50528f 100644 --- a/README.md +++ b/README.md @@ -1,205 +1,211 @@ -# ooda-engine +# ooda-engine v2 — Executive Harness for Autoresearch -Self-improving AI engine that makes your codebase better, autonomously. +Higher-order orchestration layer built on [Karpathy's autoresearch](https://github.com/karpathy/autoresearch). Translates business goals into parallel autoresearch instances, monitors their progress, cross-pollinates insights, and makes portfolio-level decisions. -Point it at any code that produces **scorable output** — websites, emails, API responses, LLM prompts, data pipelines — and it will iteratively improve the source code through multi-agent critique, voting, and verified edits. +Where autoresearch optimizes **one file** against **one metric** with **one agent**, the executive orchestrates **many instances** against **many goals** with **strategic oversight**. 
+ +``` +┌─────────────────────────────────────────────────────┐ +│ OODA EXECUTIVE (this repo) │ +│ Goals → Decompose → Spawn → Monitor → Decide → Act │ +├─────────────┬───────────────┬───────────────────────┤ +│ Instance A │ Instance B │ Instance C │ +│ autoresearch│ autoresearch │ autoresearch │ +│ train.py │ train.py │ config.ts │ +│ ↕ val_bpb │ ↕ peak_vram │ ↕ p95_latency │ +│ branch: A │ branch: B │ branch: C │ +└─────────────┴───────────────┴───────────────────────┘ +``` ## How it works -Each iteration runs the full OODA loop: +### The autoresearch protocol -``` -BUILD → OBSERVE → ORIENT → DECIDE → ACT → VERIFY -``` +Each instance follows Karpathy's autoresearch protocol: +1. Read `program.md` (research instructions) +2. Edit the target file (`train.py` or any file) +3. `git commit` +4. Run experiment, measure metric +5. If improved → keep. If not → `git reset --hard HEAD~1` +6. Repeat forever -1. **Build** — Run your build command to produce artifacts -2. **Observe** — Score every artifact (your scoring function + frequency analysis) -3. **Orient** — 3 AI critic agents independently analyze systemic patterns -4. **Decide** — Strategist agent synthesizes all critiques, picks top 3 source code edits -5. **Act** — Builder applies edits with safety checks (line-based, max 30 lines, backup) -6. **Verify** — Rebuild everything, re-score. If score drops → automatic rollback +### What the executive adds -The loop stops when it hits a **plateau** (no improvement for N iterations) or reaches the max iteration count. +The executive is the **control plane** that autoresearch doesn't have: -### What makes it different +1. **Mission decomposition** — LLM translates business goals into parallel research tracks, each becoming an autoresearch instance +2. **Program.md generation** — The executive writes each instance's `program.md`, injecting cross-instance insights, learned patterns, and strategic focus +3. 
**Portfolio monitoring** — Polls `results.tsv` from every instance, tracks progress toward business KPIs +4. **Strategic rebalancing** — Kills underperformers, redirects stalled tracks, spawns new approaches. Uses the OODA cycle (Observe → Orient → Decide → Act) at the portfolio level +5. **Cross-pollination** — Insights from one instance get injected into other instances' `program.md` files +6. **Escalation** — When all instances plateau, re-decomposes goals from scratch with entirely fresh approaches +7. **Skill memory** — Patterns learned across all instances persist in SQLite, improving future runs -- **It edits your actual source code**, not just configs or parameters -- **Multi-agent critique** — 3 independent critics prevent tunnel vision -- **Automatic rollback** — regressions are caught and reverted immediately -- **Skill memory** — learned patterns persist across runs via SQLite -- **Plateau detection** — stops when it can't improve further (no wasted API calls) -- **Deterministic + AI scoring** — fast deterministic checks catch concrete signals, AI catches subjective quality +### The OODA executive cycle + +``` +OBSERVE → Read results.tsv from all instances, compute goal progress +ORIENT → LLM analyzes portfolio: who's performing, who's stalled, what opportunities +DECIDE → LLM recommends actions: spawn, kill, redirect, combine, escalate, hold +ACT → Execute decisions via instance manager +``` + +The executive runs on a slower cadence (every 15min by default) while individual instances experiment every ~5 minutes. ## Quick start ```bash npm install ooda-engine -# or clone this repo ``` -1. Create `ooda.config.js` in your project root: +1. 
Create `executive.config.js`: ```javascript module.exports = { - name: 'My Project', - - // Files the AI can read and modify - engineFiles: { - 'src/templates.js': { path: 'src/templates.js', role: 'Page renderer' }, - 'src/prompts.js': { path: 'src/prompts.js', role: 'LLM prompts' }, - }, - - // Artifacts to build and score each iteration - testArtifacts: [ - { id: 'test-1', input: 'hello world' }, - { id: 'test-2', input: 'complex query' }, + name: 'My Optimization Portfolio', + + goals: [ + { + id: 'model-quality', + description: 'Improve model quality', + metric: 'val_bpb', + current: 0.998, + target: 0.950, + weight: 0.5, + direction: 'lower', + }, + { + id: 'memory', + description: 'Reduce GPU memory', + metric: 'peak_vram_gb', + current: 12.0, + target: 8.0, + weight: 0.3, + direction: 'lower', + }, + ], + + // Optional: pre-define tracks. Omit to let the LLM decompose goals. + tracks: [ + { + id: 'arch-search', + name: 'Architecture Search', + description: 'Find optimal depth/width tradeoff', + approach: 'Vary num_layers and d_model systematically', + metric: 'val_bpb', + metricDirection: 'lower', + relatedGoals: ['model-quality'], + priority: 'critical', + }, ], - // Build function — produce output from your code - buildCommand(artifact) { - const { render } = require('./src/templates'); - return render(artifact.input); - }, - - // Score function — rate the output 1-10 - scoreArtifact(output, artifact) { - let score = 5; - if (output.length > 1000) score += 2; - if (output.includes('error')) score -= 3; - return { score: Math.max(1, Math.min(10, score)), signals: [], issues: [] }; - }, + agentCommand: 'claude-code', // or 'codex', or null for manual + maxConcurrentInstances: 4, }; ``` -2. Set your AI provider: +2. Run: ```bash -cp .env.example .env -# Edit .env — add GEMINI_API_KEY or OPENAI_API_KEY -``` +# Plan without spawning (see what tracks get generated) +node executive.js --dry-run -3. 
Run: +# Full run — spawns autoresearch instances and manages them +node executive.js -```bash -node ooda.js # Full run (100 iterations) -node ooda.js --max=10 # Quick test -node ooda.js --dry-run # See what it would change without modifying files -node ooda.js --status # View progress from a previous run -node ooda.js --reset # Start fresh (archives old state) -``` +# Specify coding agent +node executive.js --agent=claude-code -## Use cases +# Custom review interval +node executive.js --interval=10 -| Domain | Build function | Score function | -|--------|---------------|----------------| -| **Website templates** | Render HTML from templates | Score content completeness, SEO, accessibility | -| **LLM prompts** | Run prompt, collect output | Score accuracy, relevance, format compliance | -| **Email generators** | Generate email HTML | Score deliverability signals, CTA quality | -| **API formatters** | Format sample responses | Score schema compliance, completeness | -| **Code generators** | Generate code from specs | Score compilation, test pass rate | -| **Data pipelines** | Process sample datasets | Score output accuracy, coverage | -| **Doc generators** | Generate docs from code | Score coverage, readability, correctness | +# Check portfolio status +node executive.js --status +``` ## Architecture ``` -ooda.js Main loop orchestrator +executive.js CLI entry point for v2 executive harness +ooda.js v1 OODA loop (still works standalone) lib/ - ai.js Unified AI client (Gemini / OpenAI) - db.js SQLite skill storage - skill-memory.js Learning agent — records patterns, injects into prompts - reviewer.js Multi-agent QA panel (usable standalone) + autoresearch.js Adapter for Karpathy's autoresearch + - Workspace setup (clone, branch, prepare) + - program.md generation (the control surface) + - results.tsv parsing + - Plateau detection + - Hub API client (for agenthub mode) + executive-loop.js Meta-OODA loop over autoresearch instances + instance-manager.js 
Spawn/monitor/steer/kill instances + hub.js Central coordination (file-based + hub API) + mission.js Goal decomposition via LLM + ai.js Unified AI client (Gemini / OpenAI) + db.js SQLite skill storage + skill-memory.js Cross-instance learning persistence + reviewer.js Multi-agent QA panel (usable standalone) examples/ - ooda.config.js Example configuration - reviewer-example.js Standalone reviewer usage + executive.config.js Example: ML optimization portfolio + executive.generalized.config.js Example: Full-stack product optimization + ooda.config.js Example: v1 single-loop config ``` -### The 6 agents +## Control surfaces -| # | Agent | Role | -|---|-------|------| -| 1 | **Observer** | Scores every artifact using your scoring function | -| 2 | **Critic A** | Domain-specific code analysis (customizable) | -| 3 | **Critic B** | Prompt/generation quality analysis (customizable) | -| 4 | **Critic C** | Cross-artifact pattern analysis (customizable) | -| 5 | **Strategist** | Synthesizes all critiques, picks top 3 code edits | -| 6 | **Builder** | Applies edits with safety checks + rollback | +The executive steers autoresearch through these mechanisms: -### Safety guarantees +| Surface | How | When | +|---------|-----|------| +| **program.md** | Generate/rewrite research instructions | On spawn, on redirect | +| **Git branches** | Each instance gets its own `autoresearch/` branch | On spawn | +| **#discussion channel** | Post strategic direction for agents to read | On redirect, on cross-pollination | +| **Kill signal** | SIGTERM the agent process | When an instance underperforms | +| **Hub API** | Push/pull commits, coordinate multi-machine runs | In agenthub mode | -- **Max 30 lines per edit** — prevents large, risky rewrites -- **Automatic rollback** on build failure or score regression > threshold -- **File backups** before every edit -- **Module compilation check** after edits (verifies `require()` works) -- **Plateau detection** — stops after N flat iterations 
(default: 6) -- **Dry run mode** — analyze without modifying anything +## Use cases unlocked by the executive layer -### Skill memory +### 1. Multi-objective ML research +Run autoresearch on the same model targeting different objectives simultaneously. One instance minimizes val_bpb, another minimizes VRAM, another maximizes throughput. The executive manages the tradeoffs and finds the Pareto frontier. -The engine learns from its iterations. High-confidence patterns are stored in SQLite and injected into future LLM prompts as "LEARNED PREFERENCES": +### 2. Full-stack product optimization +Not just ML — the autoresearch protocol works for any "modify file → run → measure" loop. One instance optimizes API latency (modify server config, run load test), another optimizes Lighthouse score (modify layout, run lighthouse), another optimizes conversion (modify copy, run A/B proxy). The executive coordinates all three toward a unified product quality score. -```javascript -const { applySkills, recordObservation } = require('./lib/skill-memory'); +### 3. Adversarial research +Red-team / blue-team autoresearch. One instance tries to find adversarial inputs that break the model. Another instance hardens the model against those inputs. The executive feeds red-team findings into the blue-team's program.md and vice versa. Converges toward a robust system. -// Record what works -recordObservation({ - domain: 'templates', - pattern: 'Short headlines (under 50 chars) score 20% higher', - recommendation: { action: 'prefer short headlines' }, -}); +### 4. Scientific hypothesis portfolio +Exploring a research question from multiple angles. Each instance pursues a different hypothesis (different algorithms, different problem framings, different data subsets). The executive tracks which hypotheses show promise, kills dead ends, and doubles down on the most productive threads. Like managing a research lab budget. 
-// Inject learned patterns into any prompt -const augmentedPrompt = applySkills(myPrompt, { domain: 'templates' }); -``` +### 5. Continuous production optimization +Run the executive perpetually against production metrics. Instances experiment in staging, the executive promotes proven improvements to production. When the landscape shifts (new traffic patterns, new data), the executive detects metric regression and spins up fresh tracks automatically. -### Standalone reviewer +### 6. Multi-codebase coordination +One instance optimizes the model, another the serving infrastructure, another the client SDK. The executive ensures compatibility: if the model instance changes output format, the serving instance's program.md gets updated with the new contract. Prevents the classic "works in isolation, breaks in integration" problem. -The multi-agent review panel can be used independently: +### 7. Market-responsive R&D +The executive monitors external signals (analytics APIs, competitor tracking, user feedback) alongside instance metrics. If the market shifts, the executive re-decomposes goals: kill the tracks that no longer matter, spawn new ones aligned with the new reality. -```javascript -const { createReviewPanel } = require('./lib/reviewer'); +### 8. Cost-optimized research +Different instances on different hardware tiers. The executive runs speculative/exploratory tracks on cheap GPUs and promotes promising directions to expensive hardware for verification. Portfolio management applied to compute spend. -const review = createReviewPanel([ - { - name: 'Quality Checker', - buildPrompt: (artifact) => 'You are a QA reviewer. Score 1-10. Return JSON: { "score": N, "issues": [] }', - }, -]); +### 9. Multi-agent knowledge synthesis +Each instance explores a different sub-domain of a problem. The executive reads their git histories and results, synthesizes cross-domain insights via LLM, and creates "combined" instances that merge the best ideas from multiple tracks. 
Emergent research directions that no single agent would discover. -const result = await review(myArtifact); -// { pass: true, scores: { 'Quality Checker': 8 }, feedback: [] } -``` +### 10. Automated ablation studies +The executive systematically decomposes a complex system into components, spawns one autoresearch instance per component to find its optimal configuration, then creates a final "integration" instance that combines all the per-component optima. Structured ablation at scale. -## Configuration reference +## v1 mode (single OODA loop) -```javascript -module.exports = { - // Required - name: 'string', // Project name (shown in logs/reports) - engineFiles: { ... }, // Files agents can read and modify - testArtifacts: [ ... ], // Array of artifacts to build/score - buildCommand: (artifact) => output, // Build function - scoreArtifact: (output, artifact) => score, // Scoring function - - // Optional - domainContext: 'string', // Describes your domain for the strategist - engineDir: 'path', // Root dir for engine files (default: cwd) - stateDir: 'path', // Where state is saved (default: _ooda/) - maxIterations: 100, // Or set OODA_MAX_ITERATIONS env var - plateauLimit: 6, // Stop after N flat iterations - rateLimitMs: 3500, // Delay between LLM calls - regressionThreshold: -0.3, // Rollback threshold - verifyModules: true, // require() check after edits - - // Customize critic agents - critics: { - A: { name, files, systemPrompt, buildPrompt }, - B: { name, files, systemPrompt, buildPrompt }, - C: { name, systemPrompt, buildPrompt }, - }, -}; +The original OODA engine still works exactly as before: + +```bash +node ooda.js # Run with ooda.config.js +node ooda.js --config=my.js # Custom config +node ooda.js --dry-run # Analyze without modifying +node ooda.js --status # View progress ``` +See `examples/ooda.config.js` for the v1 configuration format. 
+ ## Environment variables | Variable | Default | Description | @@ -207,8 +213,11 @@ module.exports = { | `AI_PROVIDER` | `gemini` | `gemini` or `openai` | | `GEMINI_API_KEY` | — | Required if using Gemini | | `OPENAI_API_KEY` | — | Required if using OpenAI | -| `OODA_MAX_ITERATIONS` | `100` | Max iterations per run | -| `OODA_DB_PATH` | `./skills.db` | SQLite database path for skill memory | +| `OODA_MAX_ITERATIONS` | `100` | Max iterations (v1 mode) | +| `OODA_DB_PATH` | `./skills.db` | SQLite database for skill memory | +| `AGENT_COMMAND` | — | Coding agent CLI (e.g. `claude-code`) | +| `AUTORESEARCH_HUB_URL` | — | Hub URL for agenthub mode | +| `AUTORESEARCH_API_KEY` | — | Hub API key | ## License diff --git a/examples/executive.config.js b/examples/executive.config.js new file mode 100644 index 0000000..7d725f8 --- /dev/null +++ b/examples/executive.config.js @@ -0,0 +1,115 @@ +// executive.config.js — Example executive harness configuration +// +// This example shows how to orchestrate multiple autoresearch instances +// toward business goals. The executive decomposes goals into research +// tracks, spawns autoresearch agents, and manages the portfolio. + +module.exports = { + // ── Identity ────────────────────────────────────────────────────────── + name: 'LLM Training Optimization', + domainContext: 'Optimizing a small GPT model across multiple dimensions simultaneously', + + // ── Business goals ──────────────────────────────────────────────────── + // The executive translates these into research tracks. + // Each goal has a metric, current value, target, and weight. 
+ goals: [
+ {
+ id: 'model-quality',
+ description: 'Improve model quality (val_bpb)',
+ metric: 'val_bpb',
+ current: 0.998,
+ target: 0.950,
+ weight: 0.5,
+ direction: 'lower',
+ },
+ {
+ id: 'memory-efficiency',
+ description: 'Reduce GPU memory usage',
+ metric: 'peak_vram_gb',
+ current: 12.0,
+ target: 8.0,
+ weight: 0.3,
+ direction: 'lower',
+ },
+ {
+ id: 'training-speed',
+ description: 'Maximize tokens processed per second',
+ metric: 'mfu_percent',
+ current: 30,
+ target: 50,
+ weight: 0.2,
+ direction: 'higher',
+ },
+ ],
+
+ // ── Pre-defined research tracks (optional) ────────────────────────────
+ // If omitted, the executive uses LLM to decompose goals into tracks.
+ // Pre-defining gives you explicit control over what gets researched.
+ tracks: [
+ {
+ id: 'arch-depth',
+ name: 'Architecture Depth Exploration',
+ description: 'Find optimal model depth vs width tradeoff for val_bpb',
+ approach: 'Systematically vary num_layers and d_model in train.py. Try deeper-narrower and shallower-wider configurations.',
+ metric: 'val_bpb',
+ metricDirection: 'lower',
+ relatedGoals: ['model-quality'],
+ priority: 'critical',
+ // These are autoresearch-native: the agent modifies train.py
+ ml: true,
+ },
+ {
+ id: 'optimizer-tuning',
+ name: 'Optimizer & LR Schedule',
+ description: 'Optimize learning rate schedule, warmup, and optimizer hyperparameters',
+ approach: 'Explore Muon vs AdamW settings, warmup schedules, weight decay on different parameter groups.',
+ metric: 'val_bpb',
+ metricDirection: 'lower',
+ relatedGoals: ['model-quality', 'training-speed'],
+ priority: 'high',
+ ml: true,
+ },
+ {
+ id: 'memory-opts',
+ name: 'Memory Optimization',
+ description: 'Reduce peak VRAM without hurting val_bpb',
+ approach: 'Try gradient checkpointing, mixed precision tweaks, batch size reduction with gradient accumulation.',
+ metric: 'peak_vram_gb',
+ metricDirection: 'lower',
+ relatedGoals: ['memory-efficiency'],
+ priority: 'high',
+ ml: true,
+ },
+ {
+ id: 'throughput', + name: 'Training Throughput', + description: 'Maximize MFU (model FLOPs utilization)', + approach: 'Optimize data loading, compile settings, kernel fusion, batch sizing for hardware utilization.', + metric: 'mfu_percent', + metricDirection: 'higher', + relatedGoals: ['training-speed'], + priority: 'medium', + ml: true, + }, + ], + + // ── Coding agent command ────────────────────────────────────────────── + // Which CLI to use for spawning autoresearch agents. + // Each instance gets its own program.md and git branch. + // + // Options: + // 'claude-code' — Anthropic's Claude Code CLI + // 'codex' — OpenAI's Codex CLI + // null — Manual mode (prints instructions, you run agents yourself) + agentCommand: null, // Set to 'claude-code' or 'codex' for automatic agent spawning + + // ── Executive settings ──────────────────────────────────────────────── + maxConcurrentInstances: 4, // How many autoresearch agents run simultaneously + intervalMinutes: 15, // How often the executive reviews the portfolio + maxCycles: 50, // Max executive review cycles + experimentBudgetMinutes: 5, // Per-experiment time budget (autoresearch default) + + // ── Paths ───────────────────────────────────────────────────────────── + // workDir: '/path/to/workspace', // Base directory for autoresearch clones + // hubDir: './.executive/_hub', // Where the hub stores state +}; diff --git a/examples/executive.generalized.config.js b/examples/executive.generalized.config.js new file mode 100644 index 0000000..daa8458 --- /dev/null +++ b/examples/executive.generalized.config.js @@ -0,0 +1,87 @@ +// executive.generalized.config.js — Example: Non-ML autoresearch +// +// Shows how the executive harness generalizes autoresearch beyond ML. +// The autoresearch PROTOCOL (modify file → run → measure → keep/discard) +// works for any optimization problem. The executive orchestrates multiple +// such loops toward business goals. 
+ +module.exports = { + name: 'Full-Stack Product Optimization', + domainContext: 'Optimizing a SaaS product across performance, UX, and conversion', + + goals: [ + { + id: 'api-latency', + description: 'Reduce API p95 latency', + metric: 'p95_ms', + current: 450, + target: 100, + weight: 0.3, + direction: 'lower', + }, + { + id: 'lighthouse', + description: 'Maximize Lighthouse performance score', + metric: 'lighthouse_score', + current: 62, + target: 95, + weight: 0.3, + direction: 'higher', + }, + { + id: 'conversion', + description: 'Improve signup conversion rate', + metric: 'conversion_pct', + current: 2.1, + target: 5.0, + weight: 0.4, + direction: 'higher', + }, + ], + + tracks: [ + { + id: 'api-perf', + name: 'API Performance', + description: 'Optimize API response times by modifying server configuration and query patterns', + // Generalized autoresearch: specify the target file and eval command + targetFile: 'src/server/config.ts', + evalCommand: 'npm run bench:api | grep p95', + metric: 'p95_ms', + metricDirection: 'lower', + relatedGoals: ['api-latency'], + priority: 'high', + constraints: [ + 'Do not change the API contract (request/response shapes)', + 'Keep all existing tests passing', + ], + }, + { + id: 'frontend-perf', + name: 'Frontend Performance', + description: 'Improve Lighthouse score by optimizing bundle size, rendering, and asset loading', + targetFile: 'src/app/layout.tsx', + evalCommand: 'npm run lighthouse -- --output=json | node -e "const r=JSON.parse(require(\'fs\').readFileSync(\'/dev/stdin\',\'utf8\')); console.log(\'lighthouse_score:\', r.categories.performance.score * 100)"', + metric: 'lighthouse_score', + metricDirection: 'higher', + relatedGoals: ['lighthouse'], + priority: 'high', + }, + { + id: 'landing-copy', + name: 'Landing Page Copy', + description: 'Optimize landing page copy and CTA placement for conversion', + targetFile: 'src/app/page.tsx', + evalCommand: 'npm run test:conversion-proxy', + metric: 'conversion_pct', + 
metricDirection: 'higher', + relatedGoals: ['conversion'], + priority: 'critical', + }, + ], + + agentCommand: null, + maxConcurrentInstances: 3, + intervalMinutes: 20, + maxCycles: 30, +}; diff --git a/executive.js b/executive.js new file mode 100644 index 0000000..8486214 --- /dev/null +++ b/executive.js @@ -0,0 +1,216 @@ +#!/usr/bin/env node +// ═══════════════════════════════════════════════════════════════════════════ +// OODA EXECUTIVE — Higher-Order Harness for Autoresearch +// ═══════════════════════════════════════════════════════════════════════════ +// +// The executive layer that sits ABOVE autoresearch instances. +// It translates business goals into parallel research tracks, spawns +// autoresearch agents, monitors their progress, and makes portfolio-level +// decisions: kill underperformers, redirect stalled tracks, cross-pollinate +// insights, and escalate when everything plateaus. +// +// Built on: github.com/karpathy/autoresearch +// +// Where autoresearch optimizes ONE file against ONE metric with ONE agent, +// the executive orchestrates MANY autoresearch instances against MANY +// business goals with strategic oversight. +// +// Usage: +// node executive.js # Run with executive.config.js +// node executive.js --config=my-exec.js # Custom config +// node executive.js --agent=claude-code # Specify coding agent +// node executive.js --dry-run # Plan without spawning +// node executive.js --status # Show portfolio status +// +// ═══════════════════════════════════════════════════════════════════════════ +require('dotenv').config(); +const fs = require('fs'); +const path = require('path'); + +// ── Load Config ────────────────────────────────────────────────────────── + +const CONFIG_FLAG = process.argv.find(a => a.startsWith('--config=')); +const CONFIG_PATH = CONFIG_FLAG + ? 
path.resolve(CONFIG_FLAG.split('=')[1]) + : path.join(process.cwd(), 'executive.config.js'); + +if (!fs.existsSync(CONFIG_PATH)) { + console.error(`Config not found: ${CONFIG_PATH}`); + console.error('Create an executive.config.js or pass --config=path/to/config.js'); + console.error('\nSee examples/executive.config.js for a template.'); + process.exit(1); +} + +const config = require(CONFIG_PATH); + +// Validate +const REQUIRED = ['name', 'goals']; +for (const key of REQUIRED) { + if (!config[key]) { + console.error(`Missing required config field: ${key}`); + process.exit(1); + } +} + +if (!Array.isArray(config.goals) || config.goals.length === 0) { + console.error('Config must have at least one goal in the goals array'); + process.exit(1); +} + +// ── CLI flags ───────────────────────────────────────────────────────────── + +const DRY_RUN = process.argv.includes('--dry-run'); +const AGENT_FLAG = process.argv.find(a => a.startsWith('--agent=')); +const AGENT_COMMAND = AGENT_FLAG ? AGENT_FLAG.split('=')[1] : config.agentCommand || null; + +// ── Imports ─────────────────────────────────────────────────────────────── + +const { Hub } = require('./lib/hub'); +const { InstanceManager } = require('./lib/instance-manager'); +const { runExecutiveLoop } = require('./lib/executive-loop'); +const { decomposeGoals } = require('./lib/mission'); + +// ── Main ────────────────────────────────────────────────────────────────── + +async function main() { + const args = process.argv.slice(2).filter(a => !a.startsWith('--')); + + if (process.argv.includes('--help')) { + console.log(` +OODA EXECUTIVE — Higher-Order Harness for Autoresearch + +Translates business goals into parallel autoresearch instances, +monitors their progress, and makes portfolio-level decisions. 
+ +Built on: github.com/karpathy/autoresearch + +Usage: + node executive.js Run with executive.config.js + node executive.js --config=my-exec.js Custom config + node executive.js --agent=claude-code Specify coding agent + node executive.js --dry-run Plan tracks without spawning + node executive.js --status Show portfolio status + node executive.js --interval=15 Executive review interval (min) + +Config Structure: + { + name: 'My Portfolio', + goals: [{ id, metric, current, target, weight, description }], + tracks: [{ id, name, description, ... }], // optional pre-defined tracks + agentCommand: 'claude-code', // coding agent CLI + maxConcurrentInstances: 4, + experimentBudgetMinutes: 5, + } + +The executive controls autoresearch by: + 1. Generating program.md files (research directives) + 2. Selecting which git commits to branch from + 3. Posting to #discussion channels + 4. Reading results.tsv for metrics + 5. Killing/restarting agents when they plateau +`); + return; + } + + // ── Dry run: just decompose goals and show tracks ─────────────────── + + if (DRY_RUN) { + console.log('\n[DRY RUN] Decomposing goals into research tracks...\n'); + + const tracks = await decomposeGoals(config); + if (tracks.length === 0) { + console.log('No tracks generated. 
Check your goals configuration.'); + return; + } + + console.log(`${tracks.length} research tracks planned:\n`); + for (const t of tracks) { + console.log(` [${(t.priority || 'medium').padEnd(8)}] ${t.id}`); + console.log(` ${t.name || t.description}`); + if (t.metric) console.log(` Metric: ${t.metric} (${t.metricDirection || 'lower'} is better)`); + if (t.approach) console.log(` Approach: ${t.approach.substring(0, 100)}`); + if (t.files) console.log(` Files: ${t.files.join(', ')}`); + console.log(); + } + + console.log('Run without --dry-run to spawn autoresearch instances.'); + return; + } + + // ── Status: show current portfolio ────────────────────────────────── + + if (process.argv.includes('--status')) { + const hubDir = config.hubDir || path.join(process.cwd(), '.executive', '_hub'); + if (!fs.existsSync(hubDir)) { + console.log('No executive session found. Run the executive first.'); + return; + } + const hub = new Hub(hubDir); + const snapshot = hub.snapshot(); + + console.log(`\nExecutive Portfolio Status\n${'─'.repeat(40)}`); + console.log(`Instances: ${snapshot.instances.length}`); + for (const inst of snapshot.instances) { + const results = inst.recentResults || []; + console.log(`\n ${inst.id} (${inst.status})`); + console.log(` ${inst.name || inst.description || ''}`); + console.log(` Results: ${results.length} | Best: ${inst.bestMetric || 'N/A'}`); + if (results.length > 0) { + const latest = results[results.length - 1]; + console.log(` Latest: ${JSON.stringify(latest).substring(0, 100)}`); + } + } + + if (snapshot.channels.executive.length > 0) { + console.log(`\nRecent Executive Actions:`); + for (const msg of snapshot.channels.executive.slice(-5)) { + console.log(` [${msg.timestamp?.slice(11, 19)}] ${msg.action || JSON.stringify(msg).substring(0, 80)}`); + } + } + return; + } + + // ── Full run ──────────────────────────────────────────────────────── + + const hubDir = config.hubDir || path.join(process.cwd(), '.executive', '_hub'); + const hub = 
new Hub(hubDir); + + const instanceManager = new InstanceManager(hub, { + maxConcurrent: config.maxConcurrentInstances || 4, + workDir: config.workDir || process.cwd(), + agentCommand: AGENT_COMMAND, + }); + + // Wire up events + instanceManager.on('improvement', ({ instanceId, metric }) => { + console.log(` >>> [${instanceId}] New best: ${metric}`); + }); + + instanceManager.on('plateau', ({ instanceId }) => { + console.log(` --- [${instanceId}] Plateaued`); + }); + + // Handle graceful shutdown + process.on('SIGINT', () => { + console.log('\n[EXECUTIVE] Shutting down...'); + instanceManager.killAll('user interrupt'); + process.exit(0); + }); + + const intervalFlag = process.argv.find(a => a.startsWith('--interval=')); + const intervalMinutes = intervalFlag ? parseInt(intervalFlag.split('=')[1]) : (config.intervalMinutes || 15); + + // Run the executive loop + const result = await runExecutiveLoop(config, hub, instanceManager, { + intervalMinutes, + maxCycles: config.maxCycles || 50, + }); + + console.log(`\n[EXECUTIVE] Done. ${result.totalExperiments} total experiments across ${result.cycle} cycles.`); + process.exit(0); +} + +main().catch(err => { + console.error('Fatal:', err); + process.exit(1); +}); diff --git a/lib/autoresearch.js b/lib/autoresearch.js new file mode 100644 index 0000000..3048127 --- /dev/null +++ b/lib/autoresearch.js @@ -0,0 +1,390 @@ +// autoresearch.js — Adapter for Karpathy's autoresearch +// +// Knows how to set up, configure, run, and monitor actual autoresearch +// instances. The executive harness generates program.md files and spawns +// coding agents that follow autoresearch's protocol. +// +// autoresearch protocol: +// 1. Read program.md (instructions for the agent) +// 2. Read prepare.py (fixed), train.py (mutable) +// 3. Baseline run: `uv run train.py` +// 4. 
Loop forever: edit train.py → commit → run → measure val_bpb → keep/discard +// +// The executive controls autoresearch by: +// - Writing custom program.md files (the "research directive") +// - Selecting which git commit to branch from +// - Posting to #discussion (in agenthub mode) +// - Reading results.tsv / #results for metrics +// - Killing/restarting agents when they plateau + +const fs = require('fs'); +const path = require('path'); +const { execSync, spawn } = require('child_process'); +const { generateText } = require('./ai'); + +const AUTORESEARCH_REPO = 'https://github.com/karpathy/autoresearch.git'; + +/** + * Set up an autoresearch workspace. + * Clones the repo (or uses existing), prepares data, creates working branch. + * + * @param {object} opts + * @param {string} opts.workDir — where to set up + * @param {string} [opts.branch] — base branch to start from + * @param {string} [opts.tag] — run tag (e.g. 'executive-mar10') + * @param {boolean} [opts.skipPrepare] — skip data download + * @returns {object} workspace info + */ +function setupWorkspace(opts) { + const { workDir, branch, tag, skipPrepare } = opts; + const repoDir = path.join(workDir, 'autoresearch'); + + // Clone if not present + if (!fs.existsSync(path.join(repoDir, 'train.py'))) { + console.log(` [AUTORESEARCH] Cloning ${AUTORESEARCH_REPO} → ${repoDir}`); + execSync(`git clone ${AUTORESEARCH_REPO} ${repoDir}`, { stdio: 'pipe' }); + } + + // Checkout base branch + if (branch) { + try { + execSync(`git fetch origin && git checkout ${branch}`, { cwd: repoDir, stdio: 'pipe' }); + } catch { + console.log(` [AUTORESEARCH] Branch ${branch} not found, using main`); + } + } + + // Create working branch + const runTag = tag || `executive-${Date.now()}`; + try { + execSync(`git checkout -b autoresearch/${runTag}`, { cwd: repoDir, stdio: 'pipe' }); + } catch { + // Branch may already exist + execSync(`git checkout autoresearch/${runTag}`, { cwd: repoDir, stdio: 'pipe' }); + } + + // Prepare data + 
if (!skipPrepare) { + const cacheDir = path.join(process.env.HOME || '/tmp', '.cache', 'autoresearch'); + if (!fs.existsSync(path.join(cacheDir, 'fineweb_val_000000.bin'))) { + console.log(' [AUTORESEARCH] Downloading data with prepare.py...'); + try { + execSync('uv run prepare.py', { cwd: repoDir, stdio: 'inherit', timeout: 300000 }); + } catch (err) { + console.log(` [AUTORESEARCH] prepare.py failed: ${err.message.substring(0, 200)}`); + } + } + } + + return { + repoDir, + tag: runTag, + trainPy: path.join(repoDir, 'train.py'), + preparePy: path.join(repoDir, 'prepare.py'), + programMd: path.join(repoDir, 'program.md'), + resultsTsv: path.join(repoDir, 'results.tsv'), + }; +} + +/** + * Set up a GENERALIZED autoresearch workspace for non-ML domains. + * Uses the autoresearch PROTOCOL (modify one file → run → measure → keep/discard) + * but with custom target files and metrics. + * + * @param {object} opts + * @param {string} opts.workDir — project directory + * @param {string} opts.targetFile — the file the agent modifies (like train.py) + * @param {string} opts.evalCommand — command that outputs the metric + * @param {string} opts.metric — metric name + * @param {string} opts.metricDirection — 'lower' or 'higher' + * @param {string} [opts.tag] — run tag + * @returns {object} workspace info + */ +function setupGeneralizedWorkspace(opts) { + const { workDir, targetFile, evalCommand, metric, metricDirection, tag } = opts; + + const runTag = tag || `research-${Date.now()}`; + const stateDir = path.join(workDir, '.autoresearch', runTag); + fs.mkdirSync(stateDir, { recursive: true }); + + // Initialize git branch for tracking experiments + try { + execSync(`git checkout -b autoresearch/${runTag}`, { cwd: workDir, stdio: 'pipe' }); + } catch { + // May already be in a git repo on another branch, that's fine + } + + // Initialize results.tsv + const resultsTsv = path.join(stateDir, 'results.tsv'); + if (!fs.existsSync(resultsTsv)) { + fs.writeFileSync(resultsTsv, 
`commit\t${metric}\tstatus\tdescription\n`); + } + + return { + repoDir: workDir, + stateDir, + tag: runTag, + targetFile: path.resolve(workDir, targetFile), + evalCommand, + metric, + metricDirection, + resultsTsv, + programMd: path.join(stateDir, 'program.md'), + }; +} + +/** + * Generate a program.md for a research track. + * This is the "control surface" — the executive steers autoresearch by + * writing custom instructions the coding agent follows. + * + * @param {object} track — research track from mission decomposition + * @param {object} workspace — from setupWorkspace/setupGeneralizedWorkspace + * @param {object} [context] — additional context (hub insights, cross-instance learnings) + * @returns {string} program.md content + */ +async function generateProgramMd(track, workspace, context = {}) { + const isML = !workspace.evalCommand; // ML track uses autoresearch's built-in eval + + const crossInsights = context.crossInstanceInsights + ? `\n## Insights from Other Research Tracks\n${context.crossInstanceInsights.map(i => `- [${i.source}] ${i.insight}`).join('\n')}` + : ''; + + const learnedPatterns = context.learnedPatterns + ? `\n## Learned Patterns (from previous runs)\n${context.learnedPatterns.map(p => `- ${p}`).join('\n')}` + : ''; + + const avoidList = context.failedApproaches + ? `\n## DO NOT TRY (already failed)\n${context.failedApproaches.map(a => `- ${a}`).join('\n')}` + : ''; + + if (isML) { + // Standard autoresearch program.md for ML optimization + return `# Research Track: ${track.name || track.id} + +## Objective +${track.description || 'Improve model quality as measured by val_bpb (lower is better).'} + +## Focus Area +${track.approach || 'Systematic exploration of architecture and training improvements.'} + +## Setup +1. Create a run tag: \`${workspace.tag}\` +2. Create branch: \`autoresearch/${workspace.tag}\` +3. Read all three files: \`prepare.py\`, \`train.py\`, \`program.md\` +4. Verify data at \`~/.cache/autoresearch/\` +5. 
Create \`results.tsv\` with header if not present +6. Run baseline: \`uv run train.py\` and record val_bpb + +## Experiment Loop +Repeat forever: +1. Read \`train.py\`, think of a change that could improve val_bpb +2. Apply the edit to \`train.py\` +3. \`git commit -am "description of change"\` +4. \`uv run train.py > run.log 2>&1\` (5 min budget) +5. \`grep "^val_bpb:\\|^peak_vram_mb:" run.log\` +6. Log to \`results.tsv\`: commit, val_bpb, vram, status, description +7. If val_bpb improved → keep commit, celebrate +8. If val_bpb worsened → \`git reset --hard HEAD~1\` +9. NEVER stop. NEVER ask. Run until interrupted. + +## Constraints +- Only modify \`train.py\` +- No new dependencies +- 5-minute wall-clock training budget +- 10-minute timeout = kill and treat as crash +- Simplicity matters: complexity cost must justify improvement +${crossInsights}${learnedPatterns}${avoidList}`; + } + + // Generalized autoresearch program.md for non-ML domains + return `# Research Track: ${track.name || track.id} + +## Objective +${track.description} + +## Metric +\`${workspace.metric}\` — ${workspace.metricDirection || 'lower'} is better + +## Target File +\`${workspace.targetFile}\` — this is the ONLY file you modify + +## Setup +1. Read the target file to understand current state +2. Run baseline: \`${workspace.evalCommand}\` +3. Record the baseline metric + +## Experiment Loop +Repeat forever: +1. Read the target file, hypothesize a change to improve \`${workspace.metric}\` +2. Apply the edit +3. \`git commit -am "description of change"\` +4. Run: \`${workspace.evalCommand}\` and capture output +5. Parse metric from output (look for \`${workspace.metric}: \`) +6. Log to results file: commit, metric, status, description +7. If metric improved → keep commit +8. If metric worsened → \`git reset --hard HEAD~1\` +9. NEVER stop. NEVER ask. 
+ +## Constraints +- ONLY modify \`${path.basename(workspace.targetFile)}\` +- Keep changes small and reversible +- Each experiment should complete in under ${track.experimentTimeoutMinutes || 5} minutes +${track.constraints ? track.constraints.map(c => `- ${c}`).join('\n') : ''} +${crossInsights}${learnedPatterns}${avoidList}`; +} + +/** + * Parse results.tsv from an autoresearch instance. + * @param {string} tsvPath — path to results.tsv + * @returns {Array} parsed results + */ +function parseResults(tsvPath) { + if (!fs.existsSync(tsvPath)) return []; + + const content = fs.readFileSync(tsvPath, 'utf8'); + const lines = content.trim().split('\n'); + if (lines.length <= 1) return []; // header only + + const headers = lines[0].split('\t').map(h => h.trim()); + return lines.slice(1).map(line => { + const values = line.split('\t'); + const row = {}; + headers.forEach((h, i) => { + row[h] = values[i]?.trim() || ''; + }); + // Parse numeric fields + if (row.val_bpb) row.val_bpb = parseFloat(row.val_bpb); + if (row.peak_vram_mb) row.peak_vram_mb = parseFloat(row.peak_vram_mb); + return row; + }); +} + +/** + * Get the current best metric from an autoresearch instance. + * @param {string} tsvPath — path to results.tsv + * @param {string} [metric='val_bpb'] — which column + * @param {string} [direction='lower'] — 'lower' or 'higher' + */ +function getBestResult(tsvPath, metric = 'val_bpb', direction = 'lower') { + const results = parseResults(tsvPath); + if (results.length === 0) return null; + + const kept = results.filter(r => r.status === 'kept' || r.status === 'baseline'); + const pool = kept.length > 0 ? kept : results; + + return pool.reduce((best, row) => { + if (row[metric] == null || isNaN(row[metric])) return best; + if (!best) return row; + if (direction === 'lower') return row[metric] < best[metric] ? row : best; + return row[metric] > best[metric] ? row : best; + }, null); +} + +/** + * Detect if an autoresearch instance has plateaued. 
+ * @param {string} tsvPath + * @param {number} [windowSize=10] — how many recent experiments to check + * @param {number} [threshold=0.001] — minimum improvement to not be "flat" + */ +function detectPlateau(tsvPath, windowSize = 10, threshold = 0.001) { + const results = parseResults(tsvPath); + if (results.length < windowSize) return { plateaued: false, experiments: results.length }; + + const recent = results.slice(-windowSize); + const metrics = recent.map(r => r.val_bpb || r.metric).filter(m => !isNaN(m)); + if (metrics.length < 3) return { plateaued: false, experiments: results.length }; + + const best = Math.min(...metrics); + const worst = Math.max(...metrics); + const range = worst - best; + + // Count how many of the recent experiments were kept + const keepRate = recent.filter(r => r.status === 'kept').length / recent.length; + + return { + plateaued: range < threshold && keepRate < 0.1, + range, + keepRate, + experiments: results.length, + bestRecent: best, + }; +} + +/** + * Connect to the autoresearch hub API (for agenthub mode). + * @param {string} hubUrl — e.g. 
'https://autoresearchhub.com' + * @param {string} [apiKey] — from ~/.agenthub_creds + */ +class HubClient { + constructor(hubUrl, apiKey) { + this.hubUrl = hubUrl.replace(/\/$/, ''); + this.apiKey = apiKey; + } + + async _fetch(endpoint, options = {}) { + const url = `${this.hubUrl}${endpoint}`; + const headers = { 'Content-Type': 'application/json' }; + if (this.apiKey) headers['Authorization'] = `Bearer ${this.apiKey}`; + + const response = await fetch(url, { ...options, headers: { ...headers, ...options.headers } }); + if (!response.ok) throw new Error(`Hub API error: ${response.status} ${response.statusText}`); + return response.json(); + } + + async register(name) { + return this._fetch('/api/register', { + method: 'POST', + body: JSON.stringify({ name }), + }); + } + + async getLeaves() { + return this._fetch('/api/git/leaves'); + } + + async getCommits(limit = 50) { + return this._fetch(`/api/git/commits?limit=${limit}`); + } + + async getChildren(commitHash) { + return this._fetch(`/api/git/commits/${commitHash}/children`); + } + + async getDiff(hashA, hashB) { + return this._fetch(`/api/git/diff/${hashA}/${hashB}`); + } + + async postToChannel(channel, message) { + return this._fetch(`/api/channels/${channel}/posts`, { + method: 'POST', + body: JSON.stringify(message), + }); + } + + async readChannel(channel, limit = 50) { + return this._fetch(`/api/channels/${channel}/posts?limit=${limit}`); + } + + async pushBundle(bundle) { + return this._fetch('/api/git/push', { + method: 'POST', + body: JSON.stringify({ bundle }), + }); + } + + async fetchCommit(hash) { + return this._fetch(`/api/git/fetch/${hash}`); + } +} + +module.exports = { + AUTORESEARCH_REPO, + setupWorkspace, + setupGeneralizedWorkspace, + generateProgramMd, + parseResults, + getBestResult, + detectPlateau, + HubClient, +}; diff --git a/lib/executive-loop.js b/lib/executive-loop.js new file mode 100644 index 0000000..d22151b --- /dev/null +++ b/lib/executive-loop.js @@ -0,0 +1,449 @@ +// 
executive-loop.js — The meta-OODA loop that governs autoresearch instances +// +// This is the "executive function" — it sits above individual autoresearch +// agents and makes portfolio-level decisions: +// +// OBSERVE: Read hub for all instance results, compute goal progress +// ORIENT: LLM analyzes the portfolio, identifies opportunities/stalls +// DECIDE: LLM recommends actions (spawn, kill, redirect, combine, escalate) +// ACT: Execute decisions via the instance manager +// +// The executive runs on a slower cadence than individual instances. +// While agents experiment every ~5 minutes, the executive reviews every +// N minutes (configurable, default 15). + +const { generateText } = require('./ai'); +const { decomposeGoals, redecompose, scoreGoalProgress } = require('./mission'); +const { recordObservation, applySkills } = require('./skill-memory'); + +const EXECUTIVE_ACTIONS = [ + 'spawn', // Start a new research track + 'kill', // Stop an underperforming instance + 'redirect', // Change an instance's research direction (rewrite program.md) + 'combine', // Merge insights from two instances into a new one + 'escalate', // All instances stalled — re-decompose goals from scratch + 'celebrate', // A goal was reached — log it and move on + 'hold', // Everything is on track, do nothing +]; + +/** + * Run the executive OODA loop. 
+ * + * @param {object} config — executive configuration + * @param {object} hub — Hub instance + * @param {object} instanceManager — InstanceManager instance + * @param {object} [options] + * @param {number} [options.intervalMinutes=15] — how often the executive reviews + * @param {number} [options.maxCycles=50] — max executive cycles + * @param {function} [options.onCycle] — callback after each cycle + */ +async function runExecutiveLoop(config, hub, instanceManager, options = {}) { + const { + intervalMinutes = 15, + maxCycles = 50, + onCycle, + } = options; + + console.log(` +╔══════════════════════════════════════════════════════════════════════╗ +║ OODA EXECUTIVE — ${(config.name || 'Research Portfolio').padEnd(47)}║ +║ ║ +║ Goals: ${String(config.goals.length).padEnd(3)}| Max instances: ${String(config.maxConcurrentInstances || 4).padEnd(3)}| Cycle: ${String(intervalMinutes).padEnd(3)}min ║ +║ Built on: autoresearch (github.com/karpathy/autoresearch) ║ +╚══════════════════════════════════════════════════════════════════════╝ +`); + + const state = { + cycle: 0, + startedAt: Date.now(), + goalsReached: [], + totalExperiments: 0, + trackGenerations: 1, + }; + + // ── PHASE 0: Initial mission decomposition ────────────────────────── + console.log('[EXECUTIVE] Decomposing goals into research tracks...'); + let tracks = await decomposeGoals(config); + + if (tracks.length === 0) { + console.log('[EXECUTIVE] No tracks generated. 
Check your config.'); + return state; + } + + console.log(`[EXECUTIVE] ${tracks.length} research tracks planned:`); + for (const t of tracks) { + console.log(` [${t.priority || '?'}] ${t.id}: ${t.description?.substring(0, 80)}`); + } + + // ── PHASE 1: Spawn initial instances ──────────────────────────────── + console.log('\n[EXECUTIVE] Spawning initial autoresearch instances...'); + const maxInitial = config.maxConcurrentInstances || 4; + const sortedTracks = [...tracks].sort((a, b) => { + const pri = { critical: 0, high: 1, medium: 2, low: 3 }; + return (pri[a.priority] || 2) - (pri[b.priority] || 2); + }); + + for (const track of sortedTracks.slice(0, maxInitial)) { + await instanceManager.spawn(track, { + crossInstanceInsights: [], + learnedPatterns: [], + failedApproaches: [], + }); + } + + // ── PHASE 2: Executive OODA loop ─────────────────────────────────── + for (let cycle = 1; cycle <= maxCycles; cycle++) { + state.cycle = cycle; + + console.log(`\n${'═'.repeat(70)}`); + console.log(` EXECUTIVE CYCLE ${cycle}/${maxCycles} | Runtime: ${Math.round((Date.now() - state.startedAt) / 60000)}min`); + console.log(`${'═'.repeat(70)}`); + + // Wait for instances to run experiments + await new Promise(r => setTimeout(r, intervalMinutes * 60 * 1000)); + + // ── OBSERVE ────────────────────────────────────────────────────── + console.log('\n[OBSERVE] Gathering results from all instances...'); + const hubSnapshot = hub.snapshot(); + const instanceStatus = instanceManager.status(); + const goalProgress = scoreGoalProgress(config.goals, hubSnapshot); + + // Log observations + for (const inst of instanceStatus.instances) { + console.log(` ${inst.id}: ${inst.experiments} experiments, best=${inst.bestMetric}, plateau=${inst.plateau}`); + } + for (const gp of goalProgress) { + console.log(` Goal ${gp.goalId}: ${(gp.progress * 100).toFixed(1)}% → target (current: ${gp.current})`); + } + + // Count total experiments + state.totalExperiments = 
instanceStatus.instances.reduce((sum, i) => sum + i.experiments, 0); + + // Check if any goals reached + for (const gp of goalProgress) { + if (gp.progress >= 1.0 && !state.goalsReached.includes(gp.goalId)) { + state.goalsReached.push(gp.goalId); + console.log(` >>> GOAL REACHED: ${gp.goalId} <<<`); + hub.post('executive', { action: 'celebrate', goalId: gp.goalId }); + } + } + + // If all goals reached, we're done + if (state.goalsReached.length === config.goals.length) { + console.log('\n[EXECUTIVE] ALL GOALS REACHED. Mission complete.'); + break; + } + + // ── ORIENT ─────────────────────────────────────────────────────── + console.log('\n[ORIENT] Analyzing portfolio performance...'); + + const allPlateaued = instanceStatus.instances.every(i => i.plateau); + const crossInsights = instanceManager.getCrossInstanceInsights(); + + const orientResult = await orient(config, hubSnapshot, instanceStatus, goalProgress, crossInsights, state); + console.log(` Assessment: ${orientResult.assessment}`); + if (orientResult.opportunities.length > 0) { + console.log(` Opportunities: ${orientResult.opportunities.join('; ')}`); + } + + // ── DECIDE ─────────────────────────────────────────────────────── + console.log('\n[DECIDE] Determining executive actions...'); + + let decisions; + if (allPlateaued && instanceStatus.running > 0) { + console.log(' All instances plateaued — triggering re-decomposition'); + decisions = [{ action: 'escalate', reason: 'all instances plateaued' }]; + } else { + decisions = await decide(config, orientResult, hubSnapshot, instanceStatus, goalProgress, state); + } + + for (const d of decisions) { + console.log(` → ${d.action}: ${d.reason || d.instanceId || d.trackId || ''}`); + } + + // ── ACT ────────────────────────────────────────────────────────── + console.log('\n[ACT] Executing decisions...'); + + for (const decision of decisions) { + await executeDecision(decision, config, hub, instanceManager, tracks, crossInsights, state); + } + + // Log cycle 
to hub + hub.post('executive', { + cycle, + instanceStatus: instanceStatus.instances.map(i => ({ id: i.id, experiments: i.experiments, best: i.bestMetric })), + goalProgress: goalProgress.map(g => ({ id: g.goalId, progress: g.progress })), + decisions: decisions.map(d => ({ action: d.action, target: d.instanceId || d.trackId })), + }); + + // Record learned patterns + for (const insight of crossInsights) { + recordObservation({ + domain: 'executive', + pattern: insight.insight, + source: 'autoresearch', + }); + } + + if (onCycle) onCycle(state); + } + + // ── Final report ─────────────────────────────────────────────────── + printExecutiveReport(config, state, hub); + instanceManager.killAll('executive loop complete'); + + return state; +} + +// ═══════════════════════════════════════════════════════════════════════ +// ORIENT — LLM-powered portfolio analysis +// ═══════════════════════════════════════════════════════════════════════ +async function orient(config, hubSnapshot, instanceStatus, goalProgress, crossInsights, state) { + const prompt = `You are the chief research officer reviewing a portfolio of parallel autoresearch instances. 
+ +BUSINESS GOALS: +${config.goals.map(g => `- ${g.id}: ${g.description || g.metric} (current: ${g.current}, target: ${g.target}, weight: ${g.weight})`).join('\n')} + +GOAL PROGRESS: +${goalProgress.map(gp => `- ${gp.goalId}: ${(gp.progress * 100).toFixed(1)}% toward target (current: ${gp.current}, target: ${gp.target})`).join('\n')} + +RUNNING INSTANCES (each is an autoresearch loop): +${instanceStatus.instances.map(i => + `- ${i.id}: ${i.experiments} experiments, best metric=${i.bestMetric}, plateaued=${i.plateau}, keep_rate=${i.keepRate}, runtime=${i.runtimeMinutes}min, priority=${i.priority}` +).join('\n')} + +CROSS-INSTANCE INSIGHTS: +${crossInsights.map(i => `- [${i.source}] ${i.insight}`).join('\n') || 'None yet'} + +RECENT HUB DISCUSSION: +${(hubSnapshot.channels?.discussion || []).slice(-5).map(m => `- [${m.author}] ${m.text}`).join('\n') || 'None'} + +CYCLE: ${state.cycle} | TOTAL EXPERIMENTS: ${state.totalExperiments} | TRACK GENERATION: ${state.trackGenerations} + +Analyze the portfolio and provide: +1. Overall assessment (1-2 sentences) +2. Which instances are performing well vs poorly +3. Opportunities for cross-pollination +4. Whether any instances should be killed, redirected, or new ones spawned + +Return JSON: +{ + "assessment": "1-2 sentence portfolio assessment", + "performers": { + "strong": ["instance-ids doing well"], + "weak": ["instance-ids struggling"], + "plateaued": ["instance-ids that have stalled"] + }, + "opportunities": ["cross-pollination or strategic opportunities"], + "urgency": "low" | "medium" | "high" | "critical" +}`; + + const raw = await generateText( + 'You are a research portfolio manager analyzing parallel optimization tracks. 
Return ONLY valid JSON.', + applySkills(prompt, { domain: 'executive' }), + { temperature: 0.3, json: true, maxTokens: 2000 } + ); + + try { return JSON.parse(raw); } + catch { return { assessment: 'Analysis failed', performers: {}, opportunities: [], urgency: 'medium' }; } +} + +// ═══════════════════════════════════════════════════════════════════════ +// DECIDE — LLM-powered action selection +// ═══════════════════════════════════════════════════════════════════════ +async function decide(config, orientation, hubSnapshot, instanceStatus, goalProgress, state) { + const prompt = `You are the executive decision-maker for a portfolio of autoresearch instances. + +PORTFOLIO ASSESSMENT: +${orientation.assessment} + +STRONG PERFORMERS: ${(orientation.performers?.strong || []).join(', ') || 'None'} +WEAK PERFORMERS: ${(orientation.performers?.weak || []).join(', ') || 'None'} +PLATEAUED: ${(orientation.performers?.plateaued || []).join(', ') || 'None'} +OPPORTUNITIES: ${(orientation.opportunities || []).join('; ') || 'None'} +URGENCY: ${orientation.urgency || 'medium'} + +RUNNING: ${instanceStatus.running}/${instanceStatus.capacity} instances +GOALS REACHED: ${state.goalsReached.length}/${config.goals.length} + +AVAILABLE ACTIONS: +- spawn: Start a new research track (provide track config) +- kill: Stop an underperforming instance (provide instanceId + reason) +- redirect: Change an instance's focus (provide instanceId + newApproach) +- combine: Merge insights from two instances (provide source IDs + approach) +- hold: Do nothing — everything is on track + +Rules: +1. Don't kill ALL instances — keep at least one running +2. Kill+spawn is better than redirect when an instance is deeply plateaued +3. Redirect when an instance is slightly off-track but still productive +4. Spawn new tracks when capacity allows and opportunities exist +5. Hold when things are progressing well +6. 
Prefer FEWER actions (1-3 per cycle) + +Return JSON: +{ + "reasoning": "1-2 sentences on decision rationale", + "actions": [ + { + "action": "kill|spawn|redirect|combine|hold", + "instanceId": "for kill/redirect", + "reason": "why this action", + "newApproach": "for redirect — new research direction", + "trackConfig": { + "id": "for spawn — new track id", + "name": "track name", + "description": "what to optimize", + "approach": "how to approach it" + } + } + ] +}`; + + const raw = await generateText( + 'You are a research portfolio manager making resource allocation decisions. Return ONLY valid JSON.', + prompt, + { temperature: 0.4, json: true, maxTokens: 3000 } + ); + + try { + const parsed = JSON.parse(raw); + console.log(` Reasoning: ${parsed.reasoning}`); + return parsed.actions || []; + } catch { + return [{ action: 'hold', reason: 'decision parse failed' }]; + } +} + +// ═══════════════════════════════════════════════════════════════════════ +// ACT — Execute a decision +// ═══════════════════════════════════════════════════════════════════════ +async function executeDecision(decision, config, hub, instanceManager, tracks, crossInsights, state) { + switch (decision.action) { + case 'kill': + if (decision.instanceId) { + instanceManager.kill(decision.instanceId, decision.reason); + } + break; + + case 'spawn': + if (decision.trackConfig) { + await instanceManager.spawn(decision.trackConfig, { + crossInstanceInsights: crossInsights, + learnedPatterns: [], + failedApproaches: [], + }); + } + break; + + case 'redirect': + if (decision.instanceId && decision.newApproach) { + await instanceManager.steer(decision.instanceId, decision.newApproach, { + crossInstanceInsights: crossInsights, + }); + } + break; + + case 'combine': { + const sources = decision.sourceInstances || []; + const combined = crossInsights.filter(i => sources.includes(i.source)); + const newTrack = { + id: `combined-${Date.now()}`, + name: `Combined: ${sources.join(' + ')}`, + description: 
decision.newApproach || 'Combined track from cross-pollination', + approach: decision.newApproach, + ...decision.trackConfig, + }; + await instanceManager.spawn(newTrack, { + crossInstanceInsights: combined, + }); + break; + } + + case 'escalate': { + console.log(' [ESCALATE] All tracks plateaued — re-decomposing goals...'); + const hubSnapshot = hub.snapshot(); + const newTracks = await redecompose(config, hubSnapshot, tracks); + state.trackGenerations++; + + if (newTracks.length > 0) { + // Kill all current instances + instanceManager.killAll('re-decomposition'); + // Wait for processes to die + await new Promise(r => setTimeout(r, 2000)); + // Spawn new tracks + for (const track of newTracks.slice(0, config.maxConcurrentInstances || 4)) { + await instanceManager.spawn(track, { + crossInstanceInsights: crossInsights, + failedApproaches: tracks.map(t => t.approach).filter(Boolean), + }); + } + tracks.length = 0; + tracks.push(...newTracks); + } else { + console.log(' [ESCALATE] Re-decomposition yielded no new tracks. 
Holding.'); + } + break; + } + + case 'celebrate': + hub.post('executive', { + action: 'celebrate', + goalId: decision.goalId, + message: `Goal ${decision.goalId} reached!`, + }); + break; + + case 'hold': + default: + console.log(' [HOLD] No action needed this cycle.'); + break; + } +} + +// ═══════════════════════════════════════════════════════════════════════ +// REPORT +// ═══════════════════════════════════════════════════════════════════════ +function printExecutiveReport(config, state, hub) { + const runtime = Math.round((Date.now() - state.startedAt) / 60000); + + console.log(`\n${'═'.repeat(70)}`); + console.log(` EXECUTIVE REPORT — ${config.name || 'Research Portfolio'}`); + console.log(`${'═'.repeat(70)}`); + console.log(` Runtime: ${runtime} minutes`); + console.log(` Executive cycles: ${state.cycle}`); + console.log(` Total experiments: ${state.totalExperiments}`); + console.log(` Track generations: ${state.trackGenerations}`); + console.log(` Goals reached: ${state.goalsReached.length}/${config.goals.length}`); + + if (state.goalsReached.length > 0) { + console.log(` Reached: ${state.goalsReached.join(', ')}`); + } + + const snapshot = hub.snapshot(); + console.log(`\n Instance Results:`); + for (const inst of snapshot.instances) { + const results = inst.recentResults || []; + console.log(` ${inst.id}: ${inst.status} | ${results.length} logged results | best: ${inst.bestMetric || 'N/A'}`); + } + + // Save report + const report = { + config: config.name, + date: new Date().toISOString(), + runtime, + cycles: state.cycle, + totalExperiments: state.totalExperiments, + trackGenerations: state.trackGenerations, + goalsReached: state.goalsReached, + instances: snapshot.instances, + }; + + const _fs = require('fs'); + const _path = require('path'); + const reportPath = _path.join(hub.hubDir, 'executive-report.json'); + _fs.writeFileSync(reportPath, JSON.stringify(report, null, 2)); + console.log(`\n Report saved: ${reportPath}`); +} + +module.exports = { 
runExecutiveLoop, EXECUTIVE_ACTIONS }; diff --git a/lib/hub.js b/lib/hub.js new file mode 100644 index 0000000..ffbc26c --- /dev/null +++ b/lib/hub.js @@ -0,0 +1,162 @@ +// hub.js — Central coordination hub for research instances +// +// Inspired by autoresearch's agenthub: a shared space where instances post +// results, read each other's findings, and receive directives from the executive. +// +// All state is file-based (no HTTP server needed). Instances write to the hub +// directory, the executive reads from it. Simple, debuggable, works offline. + +const fs = require('fs'); +const path = require('path'); + +class Hub { + constructor(hubDir) { + this.hubDir = hubDir; + this.channelsDir = path.join(hubDir, 'channels'); + this.instancesDir = path.join(hubDir, 'instances'); + fs.mkdirSync(this.channelsDir, { recursive: true }); + fs.mkdirSync(this.instancesDir, { recursive: true }); + } + + // ── Instance registry ────────────────────────────────────────────────── + + registerInstance(id, meta) { + const dir = path.join(this.instancesDir, id); + fs.mkdirSync(dir, { recursive: true }); + fs.writeFileSync(path.join(dir, 'meta.json'), JSON.stringify({ + ...meta, + registeredAt: new Date().toISOString(), + status: 'registered', + }, null, 2)); + } + + updateInstanceStatus(id, status, extra = {}) { + const metaPath = path.join(this.instancesDir, id, 'meta.json'); + if (!fs.existsSync(metaPath)) return; + const meta = JSON.parse(fs.readFileSync(metaPath, 'utf8')); + meta.status = status; + meta.updatedAt = new Date().toISOString(); + Object.assign(meta, extra); + fs.writeFileSync(metaPath, JSON.stringify(meta, null, 2)); + } + + getInstanceMeta(id) { + const metaPath = path.join(this.instancesDir, id, 'meta.json'); + if (!fs.existsSync(metaPath)) return null; + return JSON.parse(fs.readFileSync(metaPath, 'utf8')); + } + + listInstances() { + if (!fs.existsSync(this.instancesDir)) return []; + return fs.readdirSync(this.instancesDir) + .filter(d => 
fs.statSync(path.join(this.instancesDir, d)).isDirectory()) + .map(id => this.getInstanceMeta(id)) + .filter(Boolean); + } + + // ── Results log (append-only per instance) ───────────────────────────── + + logResult(instanceId, result) { + const dir = path.join(this.instancesDir, instanceId); + fs.mkdirSync(dir, { recursive: true }); + const entry = { + ...result, + timestamp: new Date().toISOString(), + }; + fs.appendFileSync( + path.join(dir, 'results.jsonl'), + JSON.stringify(entry) + '\n' + ); + } + + getResults(instanceId, limit = 50) { + const file = path.join(this.instancesDir, instanceId, 'results.jsonl'); + if (!fs.existsSync(file)) return []; + const lines = fs.readFileSync(file, 'utf8').trim().split('\n').filter(Boolean); + return lines.slice(-limit).map(l => { + try { return JSON.parse(l); } + catch { return null; } + }).filter(Boolean); + } + + getAllResults(limit = 100) { + const instances = this.listInstances(); + const all = []; + for (const inst of instances) { + const results = this.getResults(inst.id, limit); + for (const r of results) { + all.push({ instanceId: inst.id, ...r }); + } + } + return all.sort((a, b) => (a.timestamp || '').localeCompare(b.timestamp || '')); + } + + // ── Channels (shared discussion) ─────────────────────────────────────── + // Like autoresearch's #results and #discussion channels + + post(channel, message) { + const file = path.join(this.channelsDir, `${channel}.jsonl`); + const entry = { + ...message, + timestamp: new Date().toISOString(), + }; + fs.appendFileSync(file, JSON.stringify(entry) + '\n'); + } + + read(channel, limit = 50) { + const file = path.join(this.channelsDir, `${channel}.jsonl`); + if (!fs.existsSync(file)) return []; + const lines = fs.readFileSync(file, 'utf8').trim().split('\n').filter(Boolean); + return lines.slice(-limit).map(l => { + try { return JSON.parse(l); } + catch { return null; } + }).filter(Boolean); + } + + // ── Directives (executive → instance) ────────────────────────────────── + 
+ sendDirective(instanceId, directive) { + const dir = path.join(this.instancesDir, instanceId); + fs.mkdirSync(dir, { recursive: true }); + const entry = { + ...directive, + timestamp: new Date().toISOString(), + acknowledged: false, + }; + fs.appendFileSync( + path.join(dir, 'directives.jsonl'), + JSON.stringify(entry) + '\n' + ); + } + + getDirectives(instanceId) { + const file = path.join(this.instancesDir, instanceId, 'directives.jsonl'); + if (!fs.existsSync(file)) return []; + return fs.readFileSync(file, 'utf8').trim().split('\n') + .filter(Boolean) + .map(l => { try { return JSON.parse(l); } catch { return null; } }) + .filter(Boolean); + } + + // ── Snapshot (full hub state for the executive LLM) ──────────────────── + + snapshot() { + const instances = this.listInstances(); + const snapshot = { + timestamp: new Date().toISOString(), + instances: instances.map(inst => ({ + ...inst, + recentResults: this.getResults(inst.id, 10), + pendingDirectives: this.getDirectives(inst.id).filter(d => !d.acknowledged), + })), + channels: { + results: this.read('results', 20), + discussion: this.read('discussion', 20), + executive: this.read('executive', 10), + }, + }; + return snapshot; + } +} + +module.exports = { Hub }; diff --git a/lib/instance-manager.js b/lib/instance-manager.js new file mode 100644 index 0000000..75324cd --- /dev/null +++ b/lib/instance-manager.js @@ -0,0 +1,376 @@ +// instance-manager.js — Manage autoresearch instances +// +// Each instance is a real autoresearch setup: a coding agent (Claude, Codex, etc.) +// running program.md against a git branch. 
The manager handles: +// - Setting up workspaces (clone repo, prepare data, create branches) +// - Generating program.md files (the executive's main control surface) +// - Spawning coding agents pointed at program.md +// - Monitoring results.tsv for progress +// - Detecting plateaus and reporting to the executive +// - Cross-pollinating insights between instances + +const path = require('path'); +const fs = require('fs'); +const { spawn, execSync } = require('child_process'); +const EventEmitter = require('events'); +const { + setupWorkspace, + setupGeneralizedWorkspace, + generateProgramMd, + parseResults, + getBestResult, + detectPlateau, +} = require('./autoresearch'); + +class InstanceManager extends EventEmitter { + constructor(hub, options = {}) { + super(); + this.hub = hub; + this.instances = new Map(); // id → instance state + this.maxConcurrent = options.maxConcurrent || 4; + this.workDir = options.workDir || process.cwd(); + this.agentCommand = options.agentCommand || null; // e.g. 'claude-code' or 'codex' + this._pollInterval = null; + } + + // ── Spawn an autoresearch instance ───────────────────────────────────── + + async spawn(track, context = {}) { + if (this.instances.size >= this.maxConcurrent) { + console.log(` [INSTANCES] At capacity (${this.maxConcurrent}). 
Queue ${track.id}.`); + return null; + } + + if (this.instances.has(track.id)) { + console.log(` [INSTANCES] ${track.id} already running.`); + return null; + } + + // Set up the autoresearch workspace + let workspace; + if (track.ml !== false && !track.targetFile) { + // Standard ML autoresearch (optimize train.py against val_bpb) + workspace = setupWorkspace({ + workDir: path.join(this.workDir, '.executive', track.id), + branch: track.baseBranch, + tag: track.id, + skipPrepare: track.skipPrepare, + }); + } else { + // Generalized autoresearch (any file, any metric) + workspace = setupGeneralizedWorkspace({ + workDir: track.workDir || this.workDir, + targetFile: track.targetFile, + evalCommand: track.evalCommand || track.experimentCommand, + metric: track.metric || 'score', + metricDirection: track.metricDirection || 'higher', + tag: track.id, + }); + } + + // Generate the program.md (the executive's control surface) + const programContent = await generateProgramMd(track, workspace, context); + fs.writeFileSync(workspace.programMd, programContent); + + // Register in hub + this.hub.registerInstance(track.id, { + id: track.id, + name: track.name, + description: track.description, + metric: workspace.metric || 'val_bpb', + metricDirection: workspace.metricDirection || 'lower', + relatedGoals: track.relatedGoals || [], + priority: track.priority || 'medium', + workspacePath: workspace.repoDir, + resultsTsvPath: workspace.resultsTsv, + programMdPath: workspace.programMd, + targetFile: workspace.targetFile, + }); + + // Spawn the coding agent + const proc = this._spawnAgent(track, workspace); + + const instance = { + process: proc, + track, + workspace, + startedAt: Date.now(), + lastResultCount: 0, + bestMetric: null, + }; + + this.instances.set(track.id, instance); + this.hub.updateInstanceStatus(track.id, 'running'); + + console.log(` [INSTANCES] Spawned: ${track.id}`); + console.log(` Workspace: ${workspace.repoDir}`); + console.log(` Program: 
${workspace.programMd}`); + console.log(` Results: ${workspace.resultsTsv}`); + + // Start polling results.tsv for this instance + this._startPolling(); + + return track.id; + } + + // ── Spawn the actual coding agent ────────────────────────────────────── + + _spawnAgent(track, workspace) { + const agentCmd = track.agentCommand || this.agentCommand; + + if (!agentCmd) { + // No agent command configured — write a helper script the user can + // point their coding agent at manually + const helperPath = path.join(workspace.stateDir || workspace.repoDir, 'start-agent.sh'); + fs.writeFileSync(helperPath, `#!/bin/bash +# Point your coding agent at program.md in this workspace: +# +# claude-code "read ${workspace.programMd} and follow the instructions" +# # or +# codex "read ${workspace.programMd} and follow the instructions" +# +# The executive will monitor results.tsv for progress. +echo "Workspace: ${workspace.repoDir}" +echo "Program: ${workspace.programMd}" +echo "Results: ${workspace.resultsTsv}" +echo "" +echo "Start your coding agent and point it at program.md" +`, { mode: 0o755 }); + + console.log(` [INSTANCES] No agent command configured. Run manually:`); + console.log(` ${helperPath}`); + return null; + } + + // Spawn the coding agent as a child process + // The agent reads program.md and runs the experiment loop + let proc; + const prompt = `Read the file ${workspace.programMd} and follow the instructions exactly. 
This is an autoresearch experiment loop — run it until interrupted.`; + + if (agentCmd.includes('claude')) { + proc = spawn(agentCmd, ['--print', '-p', prompt], { + cwd: workspace.repoDir, + env: { ...process.env }, + stdio: ['pipe', 'pipe', 'pipe'], + }); + } else if (agentCmd.includes('codex')) { + proc = spawn(agentCmd, [prompt], { + cwd: workspace.repoDir, + env: { ...process.env }, + stdio: ['pipe', 'pipe', 'pipe'], + }); + } else { + // Generic: pass prompt as first arg + proc = spawn(agentCmd, [prompt], { + cwd: workspace.repoDir, + env: { ...process.env }, + stdio: ['pipe', 'pipe', 'pipe'], + shell: true, + }); + } + + proc.on('exit', (code) => { + const instance = this.instances.get(track.id); + const results = parseResults(workspace.resultsTsv); + this.instances.delete(track.id); + this.hub.updateInstanceStatus(track.id, code === 0 ? 'completed' : 'failed', { + exitCode: code, + experiments: results.length, + bestMetric: instance?.bestMetric, + runtime: instance ? Math.round((Date.now() - instance.startedAt) / 1000) : 0, + }); + this.emit('exit', { instanceId: track.id, code, experiments: results.length }); + console.log(` [INSTANCES] ${track.id} exited (code ${code}, ${results.length} experiments)`); + }); + + if (proc.stdout) { + proc.stdout.on('data', (data) => { + // Log agent output to hub + const text = data.toString().trim(); + if (text) { + this.hub.logResult(track.id, { agentOutput: text.substring(0, 500) }); + } + }); + } + + return proc; + } + + // ── Monitor results.tsv ──────────────────────────────────────────────── + // The executive's "eyes" — polls each instance's results.tsv to detect + // new experiments, improvements, and plateaus. 
+ + _startPolling() { + if (this._pollInterval) return; + this._pollInterval = setInterval(() => this._pollResults(), 15000); // every 15s + } + + _stopPolling() { + if (this._pollInterval) { + clearInterval(this._pollInterval); + this._pollInterval = null; + } + } + + _pollResults() { + for (const [id, instance] of this.instances) { + const results = parseResults(instance.workspace.resultsTsv); + const newCount = results.length; + + if (newCount > instance.lastResultCount) { + const newResults = results.slice(instance.lastResultCount); + instance.lastResultCount = newCount; + + for (const result of newResults) { + // Log to hub + this.hub.logResult(id, result); + this.hub.post('results', { + instanceId: id, + experiment: newCount, + ...result, + }); + + // Track best metric + const metricKey = instance.workspace.metric || 'val_bpb'; + const metricVal = result[metricKey]; + if (metricVal != null && !isNaN(metricVal)) { + const direction = instance.workspace.metricDirection || 'lower'; + if (instance.bestMetric === null || + (direction === 'lower' ? 
metricVal < instance.bestMetric : metricVal > instance.bestMetric)) { + instance.bestMetric = metricVal; + this.emit('improvement', { instanceId: id, metric: metricVal, result }); + } + } + + this.emit('result', { instanceId: id, result }); + } + } + + // Check for plateau + const plateau = detectPlateau(instance.workspace.resultsTsv); + if (plateau.plateaued) { + this.emit('plateau', { instanceId: id, ...plateau }); + } + } + } + + // ── Steer an instance (rewrite its program.md) ───────────────────────── + + async steer(instanceId, newDirective, context = {}) { + const instance = this.instances.get(instanceId); + if (!instance) return false; + + console.log(` [INSTANCES] Steering ${instanceId}: ${newDirective.substring(0, 100)}`); + + // Update the program.md with new focus + const track = { + ...instance.track, + approach: newDirective, + }; + const programContent = await generateProgramMd(track, instance.workspace, context); + fs.writeFileSync(instance.workspace.programMd, programContent); + + // Post to hub discussion so the agent can pick up the change + this.hub.post('discussion', { + author: 'executive', + to: instanceId, + text: `REDIRECT: ${newDirective}`, + }); + + this.hub.sendDirective(instanceId, { + action: 'redirect', + approach: newDirective, + }); + + return true; + } + + // ── Kill an instance ─────────────────────────────────────────────────── + + kill(instanceId, reason = 'executive decision') { + const instance = this.instances.get(instanceId); + if (!instance) return false; + + console.log(` [INSTANCES] Killing ${instanceId}: ${reason}`); + + this.hub.post('executive', { + action: 'kill', + instanceId, + reason, + }); + + if (instance.process) { + instance.process.kill('SIGTERM'); + setTimeout(() => { + if (this.instances.has(instanceId)) { + instance.process?.kill('SIGKILL'); + this.instances.delete(instanceId); + } + }, 5000); + } else { + this.instances.delete(instanceId); + } + + this.hub.updateInstanceStatus(instanceId, 'killed', { 
reason }); + return true; + } + + killAll(reason = 'executive shutdown') { + for (const [id] of this.instances) { + this.kill(id, reason); + } + this._stopPolling(); + } + + // ── Status snapshot ──────────────────────────────────────────────────── + + status() { + const instances = []; + for (const [id, inst] of this.instances) { + const results = parseResults(inst.workspace.resultsTsv); + const plateau = detectPlateau(inst.workspace.resultsTsv); + + instances.push({ + id, + name: inst.track.name, + experiments: results.length, + bestMetric: inst.bestMetric, + plateau: plateau.plateaued, + keepRate: plateau.keepRate, + runtimeMinutes: Math.round((Date.now() - inst.startedAt) / 60000), + priority: inst.track.priority, + }); + } + + return { + running: this.instances.size, + capacity: this.maxConcurrent, + instances, + }; + } + + // ── Cross-pollinate insights between instances ───────────────────────── + + getCrossInstanceInsights() { + const insights = []; + + for (const [id, inst] of this.instances) { + const best = getBestResult( + inst.workspace.resultsTsv, + inst.workspace.metric || 'val_bpb', + inst.workspace.metricDirection || 'lower' + ); + + if (best && best.description) { + insights.push({ + source: id, + insight: `Best approach so far: ${best.description} (${inst.workspace.metric || 'val_bpb'}: ${best[inst.workspace.metric || 'val_bpb']})`, + }); + } + } + + return insights; + } +} + +module.exports = { InstanceManager }; diff --git a/lib/mission.js b/lib/mission.js new file mode 100644 index 0000000..6c81c9f --- /dev/null +++ b/lib/mission.js @@ -0,0 +1,190 @@ +// mission.js — Goal decomposition and mission planning +// +// The executive's first job: translate business goals into research tracks. +// Each track becomes an autoresearch-style instance running its own loop. + +const { generateText } = require('./ai'); + +/** + * Decompose high-level business goals into concrete research tracks. 
+ * + * @param {object} config — executive config + * @param {Array} config.goals — business goals with metrics + * @param {Array} [config.tracks] — pre-defined tracks (optional) + * @param {object} [hubSnapshot] — current hub state for context + * @returns {Array} research tracks to spawn + */ +async function decomposeGoals(config, hubSnapshot = null) { + // If tracks are pre-defined, use them directly + if (config.tracks && config.tracks.length > 0) { + return config.tracks.map(track => ({ + ...track, + goals: findRelatedGoals(track, config.goals), + status: 'pending', + })); + } + + // Otherwise, use LLM to decompose goals into tracks + const goalsDesc = config.goals.map(g => + `- ${g.id}: ${g.description || g.metric} (current: ${g.current}, target: ${g.target}, weight: ${g.weight})` + ).join('\n'); + + const contextBlock = hubSnapshot + ? `\nCURRENT STATE:\n${JSON.stringify(hubSnapshot.instances?.map(i => ({ + id: i.id, status: i.status, recentResults: i.recentResults?.slice(-3), + })), null, 2)}` + : ''; + + const prompt = `You are a research director planning parallel optimization tracks for a business. + +BUSINESS GOALS: +${goalsDesc} + +DOMAIN: ${config.name || 'Unknown'} +${config.domainContext ? `CONTEXT: ${config.domainContext}` : ''} +${contextBlock} + +CONSTRAINTS: +- Max ${config.maxConcurrentInstances || 4} parallel tracks +- Each track must have a clear, measurable metric +- Each track should be INDEPENDENT enough to run in parallel +- Prefer tracks that can make progress in ${config.experimentBudgetMinutes || 5} minute increments + +Decompose these goals into concrete research tracks. Each track will get its own autoresearch-style loop (modify code → run experiment → measure → keep/discard). 
+ +Return JSON: +{ + "rationale": "1-2 sentences on decomposition strategy", + "tracks": [ + { + "id": "short-slug", + "name": "Human-readable name", + "description": "What this track optimizes", + "metric": "What to measure (lower/higher is better)", + "metricDirection": "lower" | "higher", + "relatedGoals": ["goal-id-1"], + "approach": "How the autoresearch loop should approach this", + "files": ["which files to modify"], + "experimentCommand": "command to run each experiment", + "priority": "critical" | "high" | "medium" + } + ] +}`; + + const raw = await generateText( + 'You are a research director who decomposes business goals into parallel optimization tracks. Return ONLY valid JSON.', + prompt, + { temperature: 0.4, json: true, maxTokens: 4000 } + ); + + try { + const parsed = JSON.parse(raw); + return (parsed.tracks || []).map(track => ({ + ...track, + type: 'autoresearch', + status: 'pending', + })); + } catch { + console.error('Failed to parse mission decomposition'); + return []; + } +} + +/** + * Re-decompose when all tracks have plateaued. Generates fresh approaches. + */ +async function redecompose(config, hubSnapshot, previousTracks) { + const prevDesc = previousTracks.map(t => { + const results = hubSnapshot?.instances?.find(i => i.id === t.id)?.recentResults || []; + const best = results.length > 0 + ? Math.min(...results.map(r => r.metric || Infinity)) + : 'no results'; + return `- ${t.id}: ${t.description} → best metric: ${best}, status: ${t.status}`; + }).join('\n'); + + const prompt = `You are a research director. ALL current research tracks have plateaued. + +BUSINESS GOALS: +${config.goals.map(g => `- ${g.id}: ${g.description || g.metric} (current: ${g.current}, target: ${g.target})`).join('\n')} + +PLATEAUED TRACKS: +${prevDesc} + +HUB DISCUSSION (recent insights): +${(hubSnapshot?.channels?.discussion || []).slice(-10).map(m => `- [${m.author}] ${m.text}`).join('\n') || 'None'} + +Generate ENTIRELY NEW approaches. 
Do NOT repeat what was tried before. Think orthogonally: +- Different algorithmic approaches +- Different problem framings +- Combining insights from multiple tracks +- Attacking the problem from a different angle + +Return JSON with same schema as before (tracks array with id, name, description, metric, etc.)`; + + const raw = await generateText( + 'You are a research director generating fresh approaches after previous ones plateaued. Return ONLY valid JSON.', + prompt, + { temperature: 0.7, json: true, maxTokens: 4000 } + ); + + try { + const parsed = JSON.parse(raw); + return (parsed.tracks || []).map(track => ({ + ...track, + type: 'autoresearch', + status: 'pending', + generation: (previousTracks[0]?.generation || 1) + 1, + })); + } catch { + return []; + } +} + +/** + * Score how well current instance results map to business goals. + */ +function scoreGoalProgress(goals, hubSnapshot) { + return goals.map(goal => { + const relatedInstances = (hubSnapshot?.instances || []).filter(inst => + inst.relatedGoals?.includes(goal.id) || inst.id.includes(goal.id) + ); + + // Find best metric across related instances + let bestMetric = goal.current; + for (const inst of relatedInstances) { + const results = inst.recentResults || []; + for (const r of results) { + if (r.metric != null) { + if (goal.direction === 'lower') { + bestMetric = Math.min(bestMetric, r.metric); + } else { + bestMetric = Math.max(bestMetric, r.metric); + } + } + } + } + + const range = Math.abs(goal.target - goal.current); + const progress = range > 0 + ? 
Math.min(1, Math.abs(bestMetric - goal.current) / range) + : 0; + + return { + goalId: goal.id, + current: bestMetric, + target: goal.target, + progress: parseFloat(progress.toFixed(3)), + onTrack: progress >= 0.3, + relatedInstances: relatedInstances.map(i => i.id), + }; + }); +} + +function findRelatedGoals(track, goals) { + if (track.relatedGoals) return track.relatedGoals; + return goals + .filter(g => track.id.includes(g.id) || track.description?.includes(g.id)) + .map(g => g.id); +} + +module.exports = { decomposeGoals, redecompose, scoreGoalProgress }; diff --git a/package.json b/package.json index f7bb18c..38473d8 100644 --- a/package.json +++ b/package.json @@ -1,25 +1,31 @@ { "name": "ooda-engine", - "version": "1.0.0", - "description": "Self-improving AI engine — autonomous code improvement via multi-agent critique, voting, and verified source edits", + "version": "2.0.0", + "description": "Executive AI harness built on autoresearch — orchestrates parallel optimization loops toward business goals", "main": "ooda.js", "bin": { - "ooda": "./ooda.js" + "ooda": "./ooda.js", + "ooda-executive": "./executive.js" }, "scripts": { "start": "node ooda.js", + "executive": "node executive.js", + "executive:dry-run": "node executive.js --dry-run", + "executive:status": "node executive.js --status", "dry-run": "node ooda.js --dry-run", "status": "node ooda.js --status", "reset": "node ooda.js --reset" }, "keywords": [ "ai", - "self-improving", + "autoresearch", "ooda-loop", "multi-agent", "autonomous", - "code-generation", - "quality-improvement", + "orchestration", + "research-automation", + "portfolio-optimization", + "executive-function", "gemini", "openai", "llm"