Bbasche · Bbasche · Mar 10, 2026
diff --git a/.env.example b/.env.example
@@ -9,3 +9,12 @@ GEMINI_API_KEY=
 
 # Max iterations before stopping (default: 100)
 # OODA_MAX_ITERATIONS=100
+
+# ── Executive Harness ────────────────────────────────────────────────────
+# Coding agent CLI for spawning autoresearch instances
+# Options: 'claude-code', 'codex', or null (manual mode)
+# AGENT_COMMAND=claude-code
+
+# Autoresearch Hub URL (for agenthub multi-agent mode)
+# AUTORESEARCH_HUB_URL=https://autoresearchhub.com
+# AUTORESEARCH_API_KEY=
diff --git a/.gitignore b/.gitignore
@@ -1,6 +1,10 @@
 node_modules/
 .env
 _ooda/
+.executive/
 *.db
 *.db-journal
 .DS_Store
+autoresearch/
+results.tsv
+run.log
diff --git a/README.md b/README.md
diff --git a/examples/executive.config.js b/examples/executive.config.js
@@ -0,0 +1,115 @@
+// executive.config.js — Example executive harness configuration
+//
+// This example shows how to orchestrate multiple autoresearch instances
+// toward business goals. The executive decomposes goals into research
+// tracks, spawns autoresearch agents, and manages the portfolio.
+
+module.exports = {
+  // ── Identity ──────────────────────────────────────────────────────────
+  name: 'LLM Training Optimization',
+  domainContext: 'Optimizing a small GPT model across multiple dimensions simultaneously',
+
+  // ── Business goals ────────────────────────────────────────────────────
+  // The executive translates these into research tracks.
+  // Each goal has a metric, current value, target, weight, and direction.
+  goals: [
+    {
+      id: 'model-quality',
+      description: 'Improve model quality (val_bpb)',
+      metric: 'val_bpb',
+      current: 0.998,
+      target: 0.950,
+      weight: 0.5,
+      direction: 'lower',
+    },
+    {
+      id: 'memory-efficiency',
+      description: 'Reduce GPU memory usage',
+      metric: 'peak_vram_mb', // same metric name (and unit) as the 'memory-opts' track
+      current: 12288, // 12 GB expressed in MB
+      target: 8192, // 8 GB expressed in MB
+      weight: 0.3,
+      direction: 'lower',
+    },
+    {
+      id: 'training-speed',
+      description: 'Maximize model FLOPs utilization (MFU)',
+      metric: 'mfu_percent',
+      current: 30,
+      target: 50,
+      weight: 0.2,
+      direction: 'higher',
+    },
+  ],
+
+  // ── Pre-defined research tracks (optional) ────────────────────────────
+  // If omitted, the executive uses an LLM to decompose goals into tracks.
+  // Pre-defining gives you explicit control over what gets researched.
+  tracks: [
+    {
+      id: 'arch-depth',
+      name: 'Architecture Depth Exploration',
+      description: 'Find optimal model depth vs width tradeoff for val_bpb',
+      approach: 'Systematically vary num_layers and d_model in train.py. Try deeper-narrower and shallower-wider configurations.',
+      metric: 'val_bpb',
+      metricDirection: 'lower',
+      relatedGoals: ['model-quality'],
+      priority: 'critical',
+      // These are autoresearch-native: the agent modifies train.py
+      ml: true,
+    },
+    {
+      id: 'optimizer-tuning',
+      name: 'Optimizer & LR Schedule',
+      description: 'Optimize learning rate schedule, warmup, and optimizer hyperparameters',
+      approach: 'Explore Muon vs AdamW settings, warmup schedules, weight decay on different parameter groups.',
+      metric: 'val_bpb',
+      metricDirection: 'lower',
+      relatedGoals: ['model-quality', 'training-speed'],
+      priority: 'high',
+      ml: true,
+    },
+    {
+      id: 'memory-opts',
+      name: 'Memory Optimization',
+      description: 'Reduce peak VRAM without hurting val_bpb',
+      approach: 'Try gradient checkpointing, mixed precision tweaks, batch size reduction with gradient accumulation.',
+      metric: 'peak_vram_mb',
+      metricDirection: 'lower',
+      relatedGoals: ['memory-efficiency'],
+      priority: 'high',
+      ml: true,
+    },
+    {
+      id: 'throughput',
+      name: 'Training Throughput',
+      description: 'Maximize MFU (model FLOPs utilization)',
+      approach: 'Optimize data loading, compile settings, kernel fusion, batch sizing for hardware utilization.',
+      metric: 'mfu_percent',
+      metricDirection: 'higher',
+      relatedGoals: ['training-speed'],
+      priority: 'medium',
+      ml: true,
+    },
+  ],
+
+  // ── Coding agent command ──────────────────────────────────────────────
+  // Which CLI to use for spawning autoresearch agents.
+  // Each instance gets its own program.md and git branch.
+  //
+  // Options:
+  //   'claude-code'     — Anthropic's Claude Code CLI
+  //   'codex'           — OpenAI's Codex CLI
+  //   null              — Manual mode (prints instructions, you run agents yourself)
+  agentCommand: null, // Set to 'claude-code' or 'codex' for automatic agent spawning
+
+  // ── Executive settings ────────────────────────────────────────────────
+  maxConcurrentInstances: 4,    // How many autoresearch agents run simultaneously
+  intervalMinutes: 15,          // How often the executive reviews the portfolio
+  maxCycles: 50,                // Max executive review cycles
+  experimentBudgetMinutes: 5,   // Per-experiment time budget (autoresearch default)
+
+  // ── Paths ─────────────────────────────────────────────────────────────
+  // workDir: '/path/to/workspace',  // Base directory for autoresearch clones
+  // hubDir: './.executive/_hub',    // Where the hub stores state
+};
diff --git a/examples/executive.generalized.config.js b/examples/executive.generalized.config.js
@@ -0,0 +1,87 @@
+// executive.generalized.config.js — Example: Non-ML autoresearch
+//
+// Shows how the executive harness generalizes autoresearch beyond ML.
+// The autoresearch PROTOCOL (modify file → run → measure → keep/discard)
+// works for any optimization problem. The executive orchestrates multiple
+// such loops toward business goals.
+
+module.exports = {
+  name: 'Full-Stack Product Optimization',
+  domainContext: 'Optimizing a SaaS product across performance, UX, and conversion',
+
+  goals: [
+    {
+      id: 'api-latency',
+      description: 'Reduce API p95 latency',
+      metric: 'p95_ms',
+      current: 450,
+      target: 100,
+      weight: 0.3,
+      direction: 'lower',
+    },
+    {
+      id: 'lighthouse',
+      description: 'Maximize Lighthouse performance score',
+      metric: 'lighthouse_score',
+      current: 62,
+      target: 95,
+      weight: 0.3,
+      direction: 'higher',
+    },
+    {
+      id: 'conversion',
+      description: 'Improve signup conversion rate',
+      metric: 'conversion_pct',
+      current: 2.1,
+      target: 5.0,
+      weight: 0.4,
+      direction: 'higher',
+    },
+  ],
+
+  tracks: [
+    {
+      id: 'api-perf',
+      name: 'API Performance',
+      description: 'Optimize API response times by modifying server configuration and query patterns',
+      // Generalized autoresearch: specify the target file and eval command
+      targetFile: 'src/server/config.ts',
+      evalCommand: 'npm run bench:api | grep p95',
+      metric: 'p95_ms',
+      metricDirection: 'lower',
+      relatedGoals: ['api-latency'],
+      priority: 'high',
+      constraints: [
+        'Do not change the API contract (request/response shapes)',
+        'Keep all existing tests passing',
+      ],
+    },
+    {
+      id: 'frontend-perf',
+      name: 'Frontend Performance',
+      description: 'Improve Lighthouse score by optimizing bundle size, rendering, and asset loading',
+      targetFile: 'src/app/layout.tsx', // --silent below keeps npm's script banner out of the piped JSON
+      evalCommand: 'npm run --silent lighthouse -- --output=json | node -e "const r=JSON.parse(require(\'fs\').readFileSync(\'/dev/stdin\',\'utf8\')); console.log(\'lighthouse_score:\', Math.round(r.categories.performance.score * 100))"',
+      metric: 'lighthouse_score', // evalCommand prints "lighthouse_score: <n>"; rounded to avoid float noise from score * 100
+      metricDirection: 'higher',
+      relatedGoals: ['lighthouse'],
+      priority: 'high',
+    },
+    {
+      id: 'landing-copy',
+      name: 'Landing Page Copy',
+      description: 'Optimize landing page copy and CTA placement for conversion',
+      targetFile: 'src/app/page.tsx',
+      evalCommand: 'npm run test:conversion-proxy',
+      metric: 'conversion_pct',
+      metricDirection: 'higher',
+      relatedGoals: ['conversion'],
+      priority: 'critical',
+    },
+  ],
+
+  agentCommand: null,           // Coding agent CLI to spawn; null leaves agent launching manual
+  maxConcurrentInstances: 3,    // How many autoresearch agents run simultaneously
+  intervalMinutes: 20,          // How often the executive reviews the portfolio
+  maxCycles: 30,                // Max executive review cycles
+};