diff --git a/README.md b/README.md index a3fed4b..2c5164f 100644 --- a/README.md +++ b/README.md @@ -825,9 +825,112 @@ uv run python -m openadapt_ml.benchmarks.cli vm monitor --mock uv run python -m openadapt_ml.benchmarks.cli vm monitor --auto-shutdown-hours 2 ``` +### 13.5 Benchmark Execution Logs + +View benchmark execution progress and logs: + +```bash +# View WAA container status and Docker logs +uv run python -m openadapt_ml.benchmarks.cli logs + +# View WAA benchmark execution logs (task progress, agent actions) +uv run python -m openadapt_ml.benchmarks.cli logs --run + +# Stream execution logs live +uv run python -m openadapt_ml.benchmarks.cli logs --run -f + +# Show last N lines of execution logs +uv run python -m openadapt_ml.benchmarks.cli logs --run --tail 100 + +# Show benchmark progress and ETA +uv run python -m openadapt_ml.benchmarks.cli logs --progress +``` + +**Example: Container status (`logs`)** +``` +WAA Status (20.12.180.208) +============================================================ + +[Docker Images] +REPOSITORY TAG SIZE +waa-auto latest 25.4GB +windowsarena/winarena latest 25.8GB + +[Container] + Status: Up 49 minutes + +[Storage] + Total: 21G + Disk image: 64G + +[QEMU VM] + Status: Running (PID 1471) + CPU: 176%, MEM: 51.6%, Uptime: 47:28 + +[WAA Server] + "status": "Probe successful" + (READY) +``` + +**Example: Benchmark execution logs (`logs --run -f`)** +``` +Run log: /home/azureuser/cli_logs/run_20260128_175507.log +------------------------------------------------------------ +Streaming log (Ctrl+C to stop)... + +[2026-01-28 23:05:10,303 INFO agent/401-MainProcess] Thinking... +[2026-01-28 23:05:17,318 INFO python/62-MainProcess] Updated computer successfully +[2026-01-28 23:05:17,318 INFO lib_run_single/56-MainProcess] Step 9: computer.window_manager.switch_to_application("Summer Trip - File Explorer") +``` + +**Example: Benchmark progress (`logs --progress`)** +``` +=== WAA Benchmark Progress === + +Log: /home/azureuser/cli_logs/run_20260128_175507.log +Started: 2026-01-28 22:55:14 +Latest: 2026-01-28 23:28:37 + +Tasks completed: 1 / 154 +Elapsed: 33 minutes + +Avg time per task: ~33 min +Remaining tasks: 153 +Estimated remaining: ~84h 9m + +Progress: 0% [1/154] +``` + +**Other useful commands:** +```bash +# Check WAA server status (probe endpoint) +uv run python -m openadapt_ml.benchmarks.cli probe + +# Check VM/Azure status +uv run python -m openadapt_ml.benchmarks.cli status + +# Download benchmark results from VM +uv run python -m openadapt_ml.benchmarks.cli download + +# Analyze downloaded results +uv run python -m openadapt_ml.benchmarks.cli analyze +``` + +**Running benchmarks:** +```bash +# Run full benchmark (154 tasks) +uv run python -m openadapt_ml.benchmarks.cli run --num-tasks 154 + +# Run specific domain +uv run python -m openadapt_ml.benchmarks.cli run --domain notepad --num-tasks 5 + +# Run single task +uv run python -m openadapt_ml.benchmarks.cli run --task notepad_1 +``` + For complete VM management commands and Azure setup instructions, see [`CLAUDE.md`](CLAUDE.md) and [`docs/azure_waa_setup.md`](docs/azure_waa_setup.md). 
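The ETA reported by `logs --progress` is simple arithmetic over the run log; for reference, the example figures above relate as follows (illustrative only, not the CLI's actual implementation):

```python
# Illustrative only -- how the `logs --progress` figures relate (not the CLI's code).
completed, total = 1, 154
elapsed_minutes = 33

avg_minutes_per_task = elapsed_minutes / completed            # ~33 min
remaining_minutes = (total - completed) * avg_minutes_per_task
hours, minutes = divmod(round(remaining_minutes), 60)
print(f"Estimated remaining: ~{hours}h {minutes}m")           # ~84h 9m
```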
-### 13.5 Screenshot Capture Tool +### 13.6 Screenshot Capture Tool Capture screenshots of dashboards and VMs for documentation and PR purposes: diff --git a/docs/REPO_CONSOLIDATION_PLAN.md b/docs/REPO_CONSOLIDATION_PLAN.md new file mode 100644 index 0000000..90af320 --- /dev/null +++ b/docs/REPO_CONSOLIDATION_PLAN.md @@ -0,0 +1,1076 @@ +# Repository Consolidation Plan + +## Overview + +Clean up the existing **two-package architecture** by moving code to the right place: + +``` +openadapt-evals # Foundation: benchmarks + infrastructure (standalone) + └── MOVE HERE: VM management, waa_deploy/, session tracking + └── Zero ML dependencies + └── Supports multiple benchmarks (WAA, OSWorld, WebArena, etc.) + +openadapt-ml # Extension: ML training + └── KEEP: training/, vlm/, baselines/, grounding/ + └── ADD DEPENDENCY: openadapt-evals + └── DELETE: duplicate benchmark code +``` + +**What this consolidation does:** +1. Moves VM/benchmark infrastructure from openadapt-ml → openadapt-evals +2. Deletes ~1000 lines of duplicate code between repos +3. Establishes proper dependency: openadapt-ml depends on openadapt-evals +4. Cleans up ~1500 lines of dead code (server patches never deployed) + +**Why `openadapt-evals` not `openadapt-waa`?** +- Avoids repo proliferation (no need for openadapt-osworld, openadapt-webarena, etc.) +- Single package supports all benchmarks with shared infrastructure +- Discoverability via README, PyPI keywords, GitHub topics instead of package name + +--- + +## Part 0: Current State (as of Jan 2026) + +### openadapt-ml Current Structure + +``` +openadapt_ml/ +├── benchmarks/ # VM + Benchmark code (mostly from PR #14) +│ ├── cli.py # ⭐ PR #14: VM lifecycle CLI (~1300 lines) +│ │ # Commands: create, delete, status, build, +│ │ # start, stop, probe, run, deallocate, +│ │ # logs, exec, docker-exec, vnc, tasks, etc. 
+│ ├── waa_deploy/ # ⭐ PR #14: Docker deployment +│ │ ├── Dockerfile # Custom WAA image build +│ │ ├── api_agent.py # Agent running inside container +│ │ ├── install.bat # Windows setup script +│ │ └── start_waa_server.bat # Server startup script +│ ├── vm_monitor.py # VM status monitoring +│ ├── azure_ops_tracker.py # Azure operation logging +│ ├── session_tracker.py # Cost/time tracking +│ ├── disk_manager.py # Disk space management +│ ├── dashboard.py # Dashboard generation +│ ├── viewer.py # Benchmark results viewer +│ │ +│ ├── # --- Duplicates (also in openadapt-evals) --- +│ ├── agent.py # → DELETE (use openadapt-evals) +│ ├── base.py # → DELETE (use openadapt-evals) +│ ├── runner.py # → DELETE (use openadapt-evals) +│ ├── waa.py # → DELETE (use openadapt-evals) +│ ├── waa_live.py # → DELETE (use openadapt-evals) +│ ├── data_collection.py # → DELETE (use openadapt-evals) +│ ├── live_tracker.py # → DELETE (use openadapt-evals) +│ ├── azure.py # → DELETE (use openadapt-evals) +│ └── trace_export.py # → DELETE (use openadapt-evals) +│ +├── cloud/ # Cloud infrastructure +│ ├── local.py # Dashboard server (~3700 lines, 90% benchmark) +│ ├── ssh_tunnel.py # SSH tunnel management +│ ├── lambda_labs.py # Lambda Labs GPU training +│ └── azure_inference.py # Azure ML inference +│ +├── training/ # ML Training (KEEP in openadapt-ml) +│ ├── trainer.py # Core trainer +│ ├── trl_trainer.py # TRL-based trainer +│ ├── stub_provider.py # Mock training for testing +│ ├── benchmark_viewer.py # Training benchmark viewer +│ ├── azure_ops_viewer.py # Azure ops viewer +│ ├── shared_ui.py # Shared UI components +│ ├── viewer.py # Training viewer +│ └── viewer_components.py # Viewer components +│ +├── models/ # VLM Adapters (KEEP in openadapt-ml) +│ ├── api_adapter.py # API-based VLM +│ ├── base_adapter.py # Base adapter interface +│ ├── qwen_vl.py # Qwen adapter +│ ├── dummy_adapter.py # Testing +│ └── providers/ # Provider implementations +│ ├── anthropic.py +│ ├── openai.py +│ └── google.py +│ +├── baselines/ # Baseline adapters (KEEP in openadapt-ml) +│ ├── adapter.py +│ ├── cli.py +│ ├── config.py +│ ├── parser.py +│ └── prompts.py +│ +├── grounding/ # UI grounding (KEEP in openadapt-ml) +│ ├── base.py +│ └── detector.py +│ +├── ingest/ # Data ingestion (KEEP in openadapt-ml) +│ ├── capture.py # OpenAdapt capture ingestion +│ ├── loader.py +│ └── synthetic.py +│ +├── retrieval/ # Demo retrieval (KEEP in openadapt-ml) +│ ├── retriever.py +│ ├── demo_retriever.py +│ ├── embeddings.py +│ └── index.py +│ +├── experiments/ # Research experiments (KEEP in openadapt-ml) +│ ├── demo_prompt/ # Demo-conditioned prompting +│ ├── representation_shootout/ # Representation experiments +│ └── waa_demo/ # WAA demo experiments +│ +├── segmentation/ # Workflow segmentation (KEEP in openadapt-ml) +│ ├── cli.py +│ ├── pipeline.py +│ ├── annotator.py +│ └── ... +│ +├── runtime/ # Runtime policy (KEEP in openadapt-ml) +│ ├── policy.py +│ └── safety_gate.py +│ +├── schema/ # Data schemas +│ ├── episode.py # Episode schema +│ └── converters.py +│ +├── evals/ # Evaluation metrics (KEEP in openadapt-ml) +│ ├── grounding.py +│ ├── trajectory_matching.py +│ └── plot_eval_metrics.py +│ +├── export/ # Data export (KEEP in openadapt-ml) +│ ├── cli.py +│ └── parquet.py +│ +├── scripts/ # CLI scripts (KEEP in openadapt-ml) +│ ├── train.py +│ ├── compare.py +│ ├── capture_screenshots.py +│ └── ... 
+│ +└── config.py # Configuration +``` + +### openadapt-evals Current Structure + +``` +openadapt_evals/ +├── adapters/ # Benchmark adapters (KEEP in openadapt-evals) +│ ├── base.py # BenchmarkAdapter interface +│ ├── waa.py # WAAMockAdapter +│ └── waa_live.py # WAALiveAdapter +│ +├── agents/ # Benchmark agents (KEEP in openadapt-evals) +│ ├── base.py # BenchmarkAgent interface +│ ├── api_agent.py # Claude/GPT API agent (P0 demo fix) +│ ├── retrieval_agent.py # Demo retrieval agent +│ ├── scripted_agent.py # Scripted agent for testing +│ ├── baseline_agent.py # → MOVE to openadapt-ml (uses VLM) +│ └── policy_agent.py # → MOVE to openadapt-ml (uses trained model) +│ +├── benchmarks/ # Benchmark framework (KEEP in openadapt-evals) +│ ├── cli.py # Evaluation CLI +│ ├── runner.py # evaluate_agent_on_benchmark() +│ ├── data_collection.py # ExecutionTraceCollector +│ ├── live_tracker.py # LiveEvaluationTracker +│ ├── monitoring.py # Benchmark monitoring +│ ├── dashboard_server.py # Dashboard HTTP server +│ ├── viewer.py # Results viewer +│ ├── config.py # Configuration +│ ├── health_checker.py # Health checking +│ ├── auto_screenshot.py # Screenshot automation +│ ├── generate_synthetic_demos.py +│ ├── validate_demos.py +│ ├── validate_screenshots.py +│ ├── agent.py # → Duplicate +│ ├── base.py # → Duplicate +│ ├── waa.py # → Duplicate +│ ├── waa_live.py # → Duplicate +│ ├── azure.py # → Duplicate +│ └── live_api.py +│ +├── evaluation/ # Evaluation framework (KEEP) +│ ├── client.py # → REVIEW (may be dead code) +│ └── discovery.py # VM IP auto-discovery (KEEP) +│ +├── server/ # Server patches → DELETE (unused) +│ ├── evaluate_endpoint.py # → DELETE (never deployed) +│ └── waa_server_patch.py # → DELETE (never deployed) +│ +├── shared_ui/ # UI components (KEEP) +│ └── keyboard_shortcuts.py +│ +├── metrics/ # Metrics (KEEP) +│ └── __init__.py +│ +└── tests/ # Tests (KEEP) + ├── test_api_agent_p0_fix.py + ├── test_api_agent_parsing.py + ├── test_cost_optimization.py + ├── test_evaluate_endpoint.py + ├── test_mock_adapter.py + ├── test_retrieval_agent.py + ├── test_runner.py + └── test_synthetic_demos.py +``` + +### PR #14 Code Summary + +PR #14 (merged Jan 2026) added the VM management CLI to openadapt-ml: + +**Files Added/Modified:** +- `openadapt_ml/benchmarks/cli.py` - ~1300 lines of VM lifecycle commands +- `openadapt_ml/benchmarks/waa_deploy/Dockerfile` - Custom WAA Docker image +- `openadapt_ml/benchmarks/waa_deploy/api_agent.py` - Agent inside container +- `openadapt_ml/benchmarks/waa_deploy/install.bat` - Windows setup +- `openadapt_ml/benchmarks/waa_deploy/start_waa_server.bat` - Server startup + +**CLI Commands (from PR #14):** +``` +create - Create Azure VM with nested virtualization +delete - Delete VM and ALL associated resources +status - Show VM state and IP +build - Build WAA image from waa_deploy/Dockerfile +start - Start WAA container +stop - Stop container +probe - Check if WAA server is ready +run - Run benchmark tasks +deallocate - Stop VM (preserves disk, stops billing) +logs - Show WAA status and logs +exec - Run command in container +docker-exec - Run docker command on host +vnc - Open VNC viewer +tasks - List available tasks +download - Download results +analyze - Analyze results +``` + +**Destination in openadapt-evals:** +- `cli.py` → `openadapt_evals/cli/vm.py` (merge with existing evals CLI) +- `waa_deploy/` → `openadapt_evals/waa_deploy/` + +--- + +## Part 0.5: Code Audit Results (VERIFIED 2026-01-28) + +> **✅ VERIFIED**: These findings have been confirmed by 
comprehensive import analysis. + +### Audit Methodology + +Verified by: +1. Checking all imports across the codebase (`grep -r "from.*module\|import.*module"`) +2. Checking exports in `__init__.py` files and `__all__` +3. Checking CLI command references +4. Checking test file imports + +### Dead Code (VERIFIED - 10 files) + +| File | Status | Evidence | +|------|--------|----------| +| `benchmarks/agent.py` | ✅ DEAD (deprecated shim) | Deprecation warning, zero imports | +| `benchmarks/base.py` | ✅ DEAD (deprecated shim) | Deprecation warning, zero imports | +| `benchmarks/waa.py` | ✅ DEAD (deprecated shim) | Deprecation warning, zero imports | +| `benchmarks/waa_live.py` | ✅ DEAD (deprecated shim) | Deprecation warning, zero imports | +| `benchmarks/auto_screenshot.py` | ✅ DEAD | Zero imports, no CLI command | +| `benchmarks/dashboard_server.py` | ✅ DEAD | Zero imports, no CLI command | +| `benchmarks/generate_synthetic_demos.py` | ✅ DEAD | Zero imports, no CLI command | +| `benchmarks/live_api.py` | ✅ DEAD | Zero imports, no CLI command | +| `benchmarks/validate_demos.py` | ✅ DEAD | Zero imports, no CLI command | +| `benchmarks/validate_screenshots.py` | ✅ DEAD | Zero imports, no CLI command | + +**Total: ~1000 lines of dead code to remove** + +### Previously Marked Dead But Actually Used (3 files) + +| File | Status | Evidence | +|------|--------|----------| +| `agents/baseline_agent.py` | ✅ USED | Lazy-exported in `agents/__init__.py` | +| `server/waa_server_patch.py` | ✅ USED | Referenced in `scripts/patch_waa_evaluate.py` | +| `server/evaluate_endpoint.py` | ✅ USED | Exported and tested (100+ tests) | + +### Agents Analysis (VERIFIED) + +Agents directory split based on ML dependencies: + +| Agent | ML Deps | Key Imports | Recommendation | +|-------|---------|-------------|----------------| +| `BenchmarkAgent` (base.py) | ❌ None | `abc`, `re`, `dataclasses` | Keep in openadapt-evals | +| `ScriptedAgent`, `RandomAgent`, `SmartMockAgent` | ❌ None | `random` | Keep in openadapt-evals | +| `ApiAgent` | ❌ None | `anthropic`, `openai` (API clients only) | Keep in openadapt-evals | +| `RetrievalAugmentedAgent` | ⚠️ `openadapt_retrieval` | Embedding models | Keep w/ lazy load | +| `PolicyAgent` | ✅ `openadapt_ml.vlm` | torch, transformers | **MOVE to openadapt-ml** | +| `BaselineAgent` | ✅ `openadapt_ml.baselines` | torch, transformers | **MOVE to openadapt-ml** | + +**Key Insight**: `ApiAgent` does NOT need ML deps - it just wraps hosted API clients (Claude, GPT). + +### Duplicates Between Repos (7 file pairs) + +These files exist in both openadapt-ml and openadapt-evals: + +| openadapt_evals/ | openadapt_ml/benchmarks/ | Notes | +|------------------|--------------------------|-------| +| `adapters/base.py` | `base.py` | Core schemas | +| `adapters/waa.py` | `waa.py` | WAA adapter | +| `adapters/waa_live.py` | `waa_live.py` | Live adapter | +| `benchmarks/runner.py` | `runner.py` | Eval loop | +| `benchmarks/data_collection.py` | `data_collection.py` | Trace saving | +| `benchmarks/live_tracker.py` | `live_tracker.py` | Progress tracking | +| `benchmarks/azure.py` | `azure.py` | Azure orchestration | + +**Recommendation**: Pick one canonical location, delete the other, update imports. 
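Before deleting any file listed above, the import check from the audit methodology can be repeated per module. A minimal sketch; the module name is an example taken from the dead-code table:

```bash
# Re-run the audit check for a single module before deleting it.
# Example: openadapt_evals/benchmarks/live_api.py (flagged as dead above).
module="live_api"
grep -rn --include="*.py" -E "(from|import) .*${module}" . \
  | grep -v "benchmarks/${module}\.py" \
  && echo "Still referenced - do not delete" \
  || echo "No references found - candidate for deletion"
```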
+ +### Genuine Value-Add (TENTATIVE - 10 files) + +These files provide functionality not available elsewhere: + +| File | Value | Confidence | +|------|-------|------------| +| `agents/api_agent.py` | **P0 demo persistence fix** - critical | High | +| `agents/retrieval_agent.py` | Demo retrieval feature | High | +| `agents/scripted_agent.py` | Testing utilities (RandomAgent, SmartMockAgent) | High | +| `evaluation/discovery.py` | VM IP auto-discovery from multiple sources | High | +| `benchmarks/cli.py` | Evaluation-focused CLI | High | +| `benchmarks/config.py` | Task loading utilities | High | +| `benchmarks/runner.py` | Core evaluation loop | High | +| `benchmarks/viewer.py` | Results viewer | High | +| `benchmarks/health_checker.py` | Used by azure.py | Medium | +| `benchmarks/monitoring.py` | Cost tracking (used by tests) | Medium | + +### Revised Migration Recommendation + +Based on this audit, the approach is **simpler than originally planned**: + +**openadapt-evals already exists** - we're consolidating INTO it, not creating a new repo. + +**Move FROM openadapt-ml TO openadapt-evals:** +- `benchmarks/cli.py` (VM commands) → merge into `openadapt_evals/cli/` +- `benchmarks/waa_deploy/` → `openadapt_evals/waa_deploy/` +- `benchmarks/vm_monitor.py` → `openadapt_evals/infrastructure/` +- `benchmarks/session_tracker.py` → `openadapt_evals/infrastructure/` +- `cloud/ssh_tunnel.py` → `openadapt_evals/infrastructure/` + +**Delete FROM openadapt-evals (VERIFIED):** +- Deprecated shims (4): `benchmarks/agent.py`, `benchmarks/base.py`, `benchmarks/waa.py`, `benchmarks/waa_live.py` +- Dead code (6): `auto_screenshot.py`, `dashboard_server.py`, `generate_synthetic_demos.py`, `live_api.py`, `validate_demos.py`, `validate_screenshots.py` + +**KEEP in openadapt-evals (previously marked for deletion but actually used):** +- `server/waa_server_patch.py` - used by `scripts/patch_waa_evaluate.py` +- `server/evaluate_endpoint.py` - exported and tested +- `agents/baseline_agent.py` - lazy-exported in public API + +**Delete FROM openadapt-ml (duplicates):** +- `benchmarks/agent.py`, `base.py`, `runner.py`, `waa.py`, `waa_live.py` +- `benchmarks/data_collection.py`, `live_tracker.py`, `azure.py` + +**Move FROM openadapt-evals TO openadapt-ml (FIXES circular dependency):** +- `agents/policy_agent.py` - currently imports `openadapt_ml.vlm` (circular!) +- `agents/baseline_agent.py` - currently imports `openadapt_ml.baselines` (circular!) +- Moving them to openadapt-ml fixes the dependency direction: + - Before: evals → ml (wrong, creates circular dep) + - After: ml has the agents, depends on evals (correct) +- Keep backward-compat lazy imports in openadapt-evals (optional, for API compat) + +**Keep in openadapt-evals (no ML deps):** +- `agents/base.py` - abstract interface +- `agents/api_agent.py` - just API clients (anthropic, openai) +- `agents/scripted_agent.py` - test agents +- `agents/retrieval_agent.py` - keep with lazy load for openadapt_retrieval + +--- + +## Part 1: Architecture + +### Package Layering + +``` +┌─────────────────────────────────────────────────────────────┐ +│ openadapt-ml │ +│ ┌─────────────────────────────────────────────────────┐ │ +│ │ Training │ VLM Inference │ Policy Agent │ │ +│ │ Fine-tuning │ Qwen, etc. 
│ Trained models │ │ +│ └─────────────────────────────────────────────────────┘ │ +│ │ │ +│ depends on │ +│ ▼ │ +├─────────────────────────────────────────────────────────────┤ +│ openadapt-evals │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────┐ │ +│ │ Adapters │ │ Agents │ │ Evaluation │ │ +│ │ WAA, OS- │ │ API (GPT, │ │ Runner, metrics │ │ +│ │ World, etc │ │ Claude) │ │ Data collection │ │ +│ └─────────────┘ └─────────────┘ └─────────────────────┘ │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────┐ │ +│ │ Infra │ │ Dashboard │ │ CLI │ │ +│ │ VM, Docker │ │ Monitoring │ │ evals command │ │ +│ │ SSH, Azure │ │ Viewers │ │ │ │ +│ └─────────────┘ └─────────────┘ └─────────────────────┘ │ +│ ┌─────────────────────────────────────────────────────┐ │ +│ │ Schemas │ Config │ Utilities │ │ +│ └─────────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────┘ +``` + +### User Journeys + +**Journey 1: Benchmark Researcher (WAA, OSWorld, etc.)** +```bash +pip install openadapt-evals +oa evals vm setup +oa evals run --agent gpt-4o --tasks 10 +oa evals view --run my_eval +oa evals vm stop +``` +- No ML dependencies (no PyTorch, no transformers) +- Lightweight install +- Supports multiple benchmarks (WAA, OSWorld, WebArena, etc.) + +**Journey 2: ML Engineer (Training + Benchmarks)** +```bash +pip install openadapt-ml # Also installs openadapt-evals +oa ml train --capture /path/to/recording --goal "Open Notepad" +oa evals run --agent policy --checkpoint ./model +oa ml serve --checkpoint ./model # Serve model for inference +oa ml dashboard # Training dashboard +``` +- Full ML training capabilities +- Uses `oa evals` for evaluation +- Trains custom agents with `oa ml`, evaluates on benchmarks + +**CLI Namespacing**: `oa evals ` for benchmarks, `oa ml ` for training. Clear ownership of commands. 
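The dependency direction in the diagram can be made concrete with a short sketch: the benchmark-facing interface lives in openadapt-evals, while the ML-heavy agent lives in openadapt-ml and subclasses it. Module paths follow the plan above; the `act()` method name and `load_policy` helper are assumptions for illustration, not the actual interface.

```python
# Sketch of the intended layering: openadapt-ml depends on openadapt-evals, never the reverse.
# BenchmarkAgent is real per the plan; its method name and the load_policy helper below
# are illustrative assumptions.
from openadapt_evals.agents.base import BenchmarkAgent   # lightweight layer, no torch

from openadapt_ml.vlm import load_policy                  # hypothetical loader; heavy ML deps stay here


class PolicyAgent(BenchmarkAgent):
    """Wraps a trained VLM policy; lives in openadapt-ml (see Part 2)."""

    def __init__(self, checkpoint: str):
        self.policy = load_policy(checkpoint)

    def act(self, observation):
        # Map a BenchmarkObservation to a BenchmarkAction via the trained model.
        return self.policy.predict(observation)
```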
+ +--- + +## Part 2: Package Structures + +### openadapt-evals (Foundation) + +``` +openadapt-evals/ +├── openadapt_evals/ +│ │ +│ ├── ══════════════════════════════════════════ +│ ├── # BENCHMARK FRAMEWORK +│ ├── ══════════════════════════════════════════ +│ │ +│ ├── schemas/ # Shared data structures +│ │ ├── __init__.py +│ │ ├── actions.py # BenchmarkAction +│ │ ├── observations.py # BenchmarkObservation +│ │ ├── tasks.py # BenchmarkTask +│ │ └── results.py # BenchmarkResult +│ │ +│ ├── adapters/ # Benchmark environment adapters +│ │ ├── __init__.py +│ │ ├── base.py # BenchmarkAdapter interface +│ │ └── waa/ # Windows Agent Arena +│ │ ├── __init__.py +│ │ ├── mock.py # WAAMockAdapter +│ │ └── live.py # WAALiveAdapter +│ │ +│ ├── agents/ # Benchmark agents +│ │ ├── __init__.py +│ │ ├── base.py # BenchmarkAgent interface +│ │ ├── api_agent.py # Claude/GPT API agent (P0 demo fix) +│ │ ├── retrieval_agent.py # Demo retrieval agent +│ │ ├── scripted_agent.py # For testing +│ │ └── random_agent.py # Baseline +│ │ +│ ├── evaluation/ # Evaluation framework +│ │ ├── __init__.py +│ │ ├── runner.py # evaluate_agent_on_benchmark() +│ │ ├── metrics.py # compute_metrics() +│ │ ├── data_collection.py # ExecutionTraceCollector +│ │ └── live_tracker.py # LiveEvaluationTracker +│ │ +│ ├── ══════════════════════════════════════════ +│ ├── # INFRASTRUCTURE +│ ├── ══════════════════════════════════════════ +│ │ +│ ├── infrastructure/ # VM & cloud infrastructure +│ │ ├── __init__.py +│ │ ├── azure_vm.py # Azure VM lifecycle +│ │ ├── vm_monitor.py # VM status monitoring +│ │ ├── session_tracker.py # Cost/time tracking +│ │ ├── ssh_tunnel.py # SSH tunnel management +│ │ ├── disk_manager.py # Disk management +│ │ └── docker.py # Docker management +│ │ +│ ├── waa_deploy/ # WAA Docker deployment +│ │ ├── Dockerfile +│ │ ├── api_agent.py # Agent for inside container +│ │ └── install.bat +│ │ +│ ├── ══════════════════════════════════════════ +│ ├── # USER INTERFACE +│ ├── ══════════════════════════════════════════ +│ │ +│ ├── cli/ # CLI commands +│ │ ├── __init__.py +│ │ ├── main.py # Entry point: oa evals +│ │ ├── vm.py # oa evals vm +│ │ ├── run.py # oa evals run +│ │ ├── view.py # oa evals view +│ │ └── tasks.py # oa evals tasks +│ │ +│ ├── dashboard/ # Monitoring dashboard +│ │ ├── __init__.py +│ │ ├── server.py # HTTP server +│ │ ├── api.py # REST endpoints +│ │ └── viewers/ # HTML generation +│ │ ├── benchmark.py +│ │ └── azure_ops.py +│ │ +│ ├── ══════════════════════════════════════════ +│ ├── # UTILITIES +│ ├── ══════════════════════════════════════════ +│ │ +│ └── config.py # Configuration (API keys, Azure, etc.) 
+│ +├── tests/ +│ ├── test_adapters.py +│ ├── test_agents.py +│ ├── test_runner.py +│ ├── test_vm.py +│ └── test_cli.py +│ +├── docs/ +│ ├── getting_started.md +│ ├── cli_reference.md +│ └── vm_setup.md +│ +├── pyproject.toml +├── README.md # Benchmark-focused marketing +└── CLAUDE.md +``` + +**pyproject.toml (openadapt-evals):** +```toml +[project] +name = "openadapt-evals" +description = "GUI agent benchmark toolkit - WAA, OSWorld, WebArena evaluations" +dependencies = [ + "httpx>=0.25.0", + "pydantic>=2.0.0", + "pillow>=10.0.0", + "azure-cli>=2.50.0", + # NO torch, NO transformers, NO heavy ML deps +] + +[project.scripts] +oa = "openadapt_evals.cli.main:main" # Provides: oa evals +``` + +### openadapt-ml (Extension) + +``` +openadapt-ml/ +├── openadapt_ml/ +│ │ +│ ├── ══════════════════════════════════════════ +│ ├── # ML TRAINING +│ ├── ══════════════════════════════════════════ +│ │ +│ ├── training/ # Model training +│ │ ├── __init__.py +│ │ ├── trainer.py # Core trainer +│ │ ├── trl_trainer.py # TRL-based trainer +│ │ ├── stub_provider.py # Mock training for testing +│ │ └── dashboard.py # Training dashboard generation +│ │ +│ ├── vlm/ # VLM inference +│ │ ├── __init__.py +│ │ ├── qwen.py # Qwen adapter +│ │ ├── api_adapter.py # API-based VLM +│ │ └── base.py +│ │ +│ ├── baselines/ # Baseline model adapters +│ │ ├── __init__.py +│ │ ├── unified_adapter.py +│ │ └── providers/ +│ │ +│ ├── grounding/ # UI element grounding +│ │ ├── __init__.py +│ │ └── gemini_grounder.py +│ │ +│ ├── ══════════════════════════════════════════ +│ ├── # AGENTS & INTEGRATION +│ ├── ══════════════════════════════════════════ +│ │ +│ ├── agents/ # ML-specific agents +│ │ ├── __init__.py +│ │ ├── policy_agent.py # Uses trained VLM policy +│ │ └── baseline_agent.py # Unified baseline agent +│ │ +│ ├── ══════════════════════════════════════════ +│ ├── # CLI EXTENSION +│ ├── ══════════════════════════════════════════ +│ │ +│ ├── cli/ # Extended CLI +│ │ ├── __init__.py +│ │ ├── main.py # Entry point: oa ml +│ │ ├── train.py # oa ml train +│ │ ├── serve.py # oa ml serve (model inference server) +│ │ └── dashboard.py # oa ml dashboard (training dashboard) +│ │ +│ ├── ══════════════════════════════════════════ +│ ├── # DATA & UTILITIES +│ ├── ══════════════════════════════════════════ +│ │ +│ ├── ingest/ # Data ingestion +│ │ ├── __init__.py +│ │ └── capture.py # OpenAdapt capture ingestion +│ │ +│ ├── cloud/ # Cloud GPU training +│ │ ├── __init__.py +│ │ ├── lambda_labs.py +│ │ └── azure_ml.py +│ │ +│ ├── experiments/ # Research experiments +│ │ ├── demo_prompt/ +│ │ └── waa_demo/ +│ │ +│ └── config.py # ML-specific config (extends evals config) +│ +├── tests/ +│ ├── test_training.py +│ ├── test_vlm.py +│ ├── test_policy_agent.py +│ └── test_cli.py +│ +├── docs/ +│ ├── training_guide.md +│ ├── model_development.md +│ └── cloud_training.md +│ +├── pyproject.toml +├── README.md +└── CLAUDE.md +``` + +**pyproject.toml (openadapt-ml):** +```toml +[project] +name = "openadapt-ml" +description = "ML training toolkit for OpenAdapt GUI automation agents" +dependencies = [ + "openadapt-evals>=0.1.0", # Foundation dependency + "torch>=2.0.0", + "transformers>=4.40.0", + "trl>=0.8.0", + "accelerate>=0.27.0", + # Heavy ML deps here +] + +# Note: oa entry point is registered by openadapt-evals +# openadapt-ml extends it by registering additional subcommands +# Implementation: oa ml routes to openadapt_ml.cli +``` + +--- + +## Part 3: CLI Design + +CLI uses namespaced subcommands: `oa evals ` for benchmarks, `oa ml ` for 
training. + +### oa evals (openadapt-evals) + +```bash +# VM Management +oa evals vm create # Create Azure VM +oa evals vm delete # Delete VM +oa evals vm start / stop # Start/stop VM +oa evals vm deallocate # Deallocate (stop billing) +oa evals vm status # Show VM status +oa evals vm setup # Full setup (Docker + benchmark image) +oa evals vm probe # Check benchmark server status +oa evals vm diag # Diagnostic info +oa evals vm logs # Container logs +oa evals vm ssh # Interactive SSH +oa evals vm vnc # Open VNC viewer + +# Evaluation +oa evals mock # Mock evaluation (no VM) +oa evals live # Live evaluation against server +oa evals run # Shorthand for common evaluation + +# Results & Monitoring +oa evals view # Generate results viewer +oa evals dashboard # Start monitoring dashboard +oa evals tasks # List available tasks + +# Configuration +oa evals config # Show/edit configuration +oa evals config set KEY VALUE +``` + +### oa ml (openadapt-ml) + +```bash +# Training +oa ml train # Start training +oa ml train --capture /path --goal "description" +oa ml train --config config.yaml +oa ml train status # Training status +oa ml train stop # Stop training + +# Model Serving (Inference) +oa ml serve # Serve trained model for inference +oa ml serve --checkpoint ./model # Serve specific checkpoint +oa ml serve --port 8080 # Custom port + +# Training Dashboard +oa ml dashboard # Start training dashboard +oa ml dashboard --port 8080 + +# Cloud Training +oa ml cloud launch # Launch cloud GPU instance +oa ml cloud status # Check cloud training +oa ml cloud terminate # Terminate instance + +# ML-specific evaluation (uses oa evals under the hood) +oa evals run --agent policy --checkpoint ./model +``` + +--- + +## Part 4: Migration Steps + +### Phase 1: Consolidate into openadapt-evals (Existing Repo) + +Since openadapt-evals already exists, we consolidate INTO it rather than creating a new repo. + +1. **Restructure openadapt-evals** for multi-benchmark support: + - Move `adapters/waa*.py` → `adapters/waa/` (subdirectory per benchmark) + - Move `benchmarks/cli.py` → restructure into `cli/` + - Move `benchmarks/dashboard_server.py` → `dashboard/` + - Create `infrastructure/` directory for VM/cloud code +2. **Copy from openadapt-ml**: + - `benchmarks/cli.py` (VM commands) → `openadapt_evals/cli/vm.py` + - `benchmarks/waa_deploy/` → `openadapt_evals/waa_deploy/` + - `benchmarks/vm_monitor.py` → `openadapt_evals/infrastructure/vm_monitor.py` + - `benchmarks/session_tracker.py` → `openadapt_evals/infrastructure/session_tracker.py` + - `benchmarks/azure_ops_tracker.py` → `openadapt_evals/infrastructure/azure_ops_tracker.py` + - `cloud/ssh_tunnel.py` → `openadapt_evals/infrastructure/ssh_tunnel.py` +3. **Clean up dead code** (after verification): + - Delete deprecated shims: `benchmarks/agent.py`, `benchmarks/base.py`, etc. + - Delete unused server patch: `server/waa_server_patch.py`, `server/evaluate_endpoint.py` +4. **Write CLI entry point**: `evals` +5. **Write tests** +6. **Write README** with multi-benchmark marketing + +### Phase 2: Refactor openadapt-ml + +1. **Add dependency**: `openadapt-evals>=0.1.0` +2. **Delete moved code**: + - `benchmarks/` (most of it) + - `cloud/local.py` (dashboard moved to evals) + - `cloud/ssh_tunnel.py` (moved to evals) +3. **Keep ML-specific code**: + - `training/` + - `vlm/` + - `baselines/` + - `grounding/` + - `ingest/` + - `cloud/lambda_labs.py`, `cloud/azure_ml.py` + - `experiments/` +4. 
**Add ML-specific agents**: + - `agents/policy_agent.py` + - `agents/baseline_agent.py` +5. **Create extended CLI**: `oa` that imports from evals and adds training +6. **Update imports** to use `openadapt_evals` +7. **Update tests** + +### Phase 3: Update Documentation + +1. **Update openadapt-evals README**: Multi-benchmark focus + - "GUI agent benchmark toolkit - WAA, OSWorld, WebArena evaluations" +2. **Update openadapt-ml README**: Training focus + - Links to openadapt-evals for evaluation +3. **Update CLAUDE.md** in both repos + +### Phase 4: Publishing & Marketing + +1. **openadapt-evals README**: Multi-benchmark-focused + - "GUI agent benchmark toolkit - WAA, OSWorld, WebArena evaluations" + - One-liner install + - Quick start examples for each supported benchmark +2. **openadapt-ml README**: Training-focused + - "Train custom GUI automation agents" + - Links to openadapt-evals for evaluation +3. **PyPI publishing**: Publish both packages +4. **Update main OpenAdapt docs** to reference both + +--- + +## Part 5: File Mapping (Detailed) + +### openadapt-evals Internal Restructuring + +These files stay in openadapt-evals but may be reorganized: + +| Current Location | New Location | Notes | +|------------------|--------------|-------| +| **Adapters** | | | +| `adapters/base.py` | `adapters/base.py` | BenchmarkAdapter interface (keep) | +| `adapters/waa.py` | `adapters/waa/mock.py` | WAAMockAdapter | +| `adapters/waa_live.py` | `adapters/waa/live.py` | WAALiveAdapter | +| **Agents** | | | +| `agents/base.py` | `agents/base.py` | BenchmarkAgent interface (keep) | +| `agents/api_agent.py` | `agents/api_agent.py` | Claude/GPT agent (P0 demo fix) | +| `agents/retrieval_agent.py` | `agents/retrieval_agent.py` | Demo retrieval | +| `agents/scripted_agent.py` | `agents/scripted_agent.py` | For testing | +| `agents/baseline_agent.py` | → MOVE to openadapt-ml | Uses VLM (ML dep) | +| `agents/policy_agent.py` | → MOVE to openadapt-ml | Uses trained model (ML dep) | +| **Evaluation** | | | +| `benchmarks/runner.py` | `evaluation/runner.py` | Core evaluation | +| `benchmarks/data_collection.py` | `evaluation/data_collection.py` | Trace collector | +| `benchmarks/live_tracker.py` | `evaluation/live_tracker.py` | Live tracking | +| `benchmarks/monitoring.py` | `evaluation/monitoring.py` | Monitoring | +| `benchmarks/health_checker.py` | `evaluation/health_checker.py` | Health checks | +| `evaluation/client.py` | `evaluation/client.py` | Eval client | +| `evaluation/discovery.py` | `evaluation/discovery.py` | Service discovery | +| **CLI** | | | +| `benchmarks/cli.py` | `cli/eval.py` | Evaluation commands | +| **Dashboard** | | | +| `benchmarks/dashboard_server.py` | `dashboard/server.py` | HTTP server | +| `benchmarks/viewer.py` | `dashboard/viewer.py` | Results viewer | +| **Config** | | | +| `benchmarks/config.py` | `config.py` | Configuration | +| **Delete (dead code)** | | | +| `server/evaluate_endpoint.py` | DELETE | Never deployed | +| `server/waa_server_patch.py` | DELETE | Never deployed | +| `benchmarks/auto_screenshot.py` | DELETE | Never imported | +| `benchmarks/generate_synthetic_demos.py` | DELETE | Never imported | +| `benchmarks/validate_demos.py` | DELETE | Never imported | +| `benchmarks/validate_screenshots.py` | DELETE | Never imported | +| `benchmarks/live_api.py` | DELETE | Never imported | +| **Delete (duplicates)** | | | +| `benchmarks/agent.py` | DELETE | Duplicate shim | +| `benchmarks/base.py` | DELETE | Duplicate shim | +| `benchmarks/waa.py` | DELETE | Duplicate shim | +| 
`benchmarks/waa_live.py` | DELETE | Duplicate shim | +| `benchmarks/azure.py` | DELETE | Duplicate | +| **UI Components** | | | +| `shared_ui/keyboard_shortcuts.py` | `shared_ui/keyboard_shortcuts.py` | UI shortcuts | +| **Tests** | | | +| `tests/test_api_agent_*.py` | `tests/test_api_agent_*.py` | Agent tests | +| `tests/test_runner.py` | `tests/test_runner.py` | Runner tests | +| `tests/test_mock_adapter.py` | `tests/test_mock_adapter.py` | Adapter tests | +| `tests/test_retrieval_agent.py` | `tests/test_retrieval_agent.py` | Retrieval tests | + +### From openadapt-ml → openadapt-evals + +| Source (openadapt_ml/) | Destination (openadapt_evals/) | Notes | +|------------------------|--------------------------------|-------| +| **PR #14 Code** | | | +| `benchmarks/cli.py` | `cli/vm.py` | ⭐ VM lifecycle commands (1300 lines) | +| `benchmarks/waa_deploy/` | `waa_deploy/` | ⭐ Docker deployment files | +| `benchmarks/waa_deploy/Dockerfile` | `waa_deploy/Dockerfile` | WAA image build | +| `benchmarks/waa_deploy/api_agent.py` | `waa_deploy/api_agent.py` | In-container agent | +| `benchmarks/waa_deploy/install.bat` | `waa_deploy/install.bat` | Windows setup | +| `benchmarks/waa_deploy/start_waa_server.bat` | `waa_deploy/start_waa_server.bat` | Server startup | +| **Infrastructure** | | | +| `benchmarks/vm_monitor.py` | `infrastructure/vm_monitor.py` | VM status monitoring | +| `benchmarks/session_tracker.py` | `infrastructure/session_tracker.py` | Cost/time tracking | +| `benchmarks/azure_ops_tracker.py` | `infrastructure/azure_ops_tracker.py` | Azure op logging | +| `benchmarks/disk_manager.py` | `infrastructure/disk_manager.py` | Disk management | +| `benchmarks/dashboard.py` | `dashboard/panels.py` | Dashboard panels | +| `cloud/ssh_tunnel.py` | `infrastructure/ssh_tunnel.py` | SSH tunnels | +| **Dashboard Server** | | | +| `cloud/local.py` (partial) | `dashboard/server.py` | ~90% is benchmark (extract) | +| | | Training parts stay in openadapt-ml | +| **Viewers** | | | +| `benchmarks/viewer.py` | `dashboard/benchmark_viewer.py` | Benchmark viewer | +| `training/azure_ops_viewer.py` | `dashboard/azure_ops_viewer.py` | Azure ops viewer | +| **Skip (Duplicates - already in openadapt-evals)** | | | +| `benchmarks/agent.py` | Skip | Already in openadapt-evals | +| `benchmarks/base.py` | Skip | Already in openadapt-evals | +| `benchmarks/runner.py` | Skip | Already in openadapt-evals | +| `benchmarks/waa.py` | Skip | Already in openadapt-evals | +| `benchmarks/waa_live.py` | Skip | Already in openadapt-evals | +| `benchmarks/data_collection.py` | Skip | Already in openadapt-evals | +| `benchmarks/live_tracker.py` | Skip | Already in openadapt-evals | +| `benchmarks/azure.py` | Skip | Already in openadapt-evals | +| `benchmarks/trace_export.py` | Skip | Not needed | + +### Stays in openadapt-ml (After Migration) + +| Directory | Contents | Notes | +|-----------|----------|-------| +| `training/` | trainer.py, trl_trainer.py, stub_provider.py, etc. | Core ML training | +| `models/` | api_adapter.py, qwen_vl.py, providers/ | VLM inference | +| `baselines/` | adapter.py, cli.py, config.py, etc. | Baseline models | +| `grounding/` | base.py, detector.py | UI grounding | +| `ingest/` | capture.py, loader.py, synthetic.py | Data ingestion | +| `retrieval/` | retriever.py, demo_retriever.py, etc. | Demo retrieval | +| `experiments/` | demo_prompt/, waa_demo/, etc. | Research | +| `segmentation/` | cli.py, pipeline.py, etc. 
| Workflow segmentation | +| `runtime/` | policy.py, safety_gate.py | Runtime policy | +| `evals/` | grounding.py, trajectory_matching.py | Eval metrics | +| `export/` | cli.py, parquet.py | Data export | +| `scripts/` | train.py, compare.py, etc. | CLI scripts | +| `schema/` | episode.py, converters.py | OR move to openadapt-evals | +| `cloud/lambda_labs.py` | GPU training | Keep | +| `cloud/azure_inference.py` | Azure ML | Keep | +| `config.py` | Configuration | Extend openadapt_evals.config | + +### New Files in openadapt-ml (After Migration) + +| File | Purpose | +|------|---------| +| `agents/policy_agent.py` | Move from openadapt-evals (ML dep) | +| `agents/baseline_agent.py` | Move from openadapt-evals (ML dep) | +| `cli/main.py` | `oa` CLI entry point (extends `evals`) | +| `cli/train.py` | Training commands | +| `cli/serve.py` | Model inference server | +| `cli/dashboard.py` | Training dashboard | + +### Delete from openadapt-ml (After Migration) + +| File | Reason | +|------|--------| +| `benchmarks/` (entire directory) | Moved to openadapt-evals | +| `cloud/local.py` | Dashboard parts moved to openadapt-evals | +| `cloud/ssh_tunnel.py` | Moved to openadapt-evals | +| `training/azure_ops_viewer.py` | Moved to openadapt-evals | +| `training/benchmark_viewer.py` | Moved to openadapt-evals | + +--- + +## Part 6: Effort Estimate + +| Phase | Tasks | Effort | +|-------|-------|--------| +| 1. Restructure openadapt-evals | Reorganize files, create cli/, infrastructure/ | 3-4 hrs | +| 2. Copy VM code from openadapt-ml | Move PR #14 code to evals | 2-3 hrs | +| 3. Write evals CLI | Entry point, subcommands | 2-3 hrs | +| 4. Clean up dead code | Delete unused files (after verification) | 1-2 hrs | +| 5. Refactor openadapt-ml | Delete moved code, add dependency | 2-3 hrs | +| 6. Write oa CLI extension | Extends evals, adds training | 1-2 hrs | +| 7. Update tests | Fix imports in both repos | 2-3 hrs | +| 8. Documentation | READMEs, CLAUDE.md, docs | 2-3 hrs | + +**Total: ~16-22 hours (2-3 days)** + +--- + +## Part 7: Success Criteria + +### openadapt-evals + +- [ ] `pip install openadapt-evals` works +- [ ] `oa evals --help` shows all commands +- [ ] `oa evals vm status` works (no ML deps imported) +- [ ] `oa evals mock --tasks 5` works +- [ ] `oa evals run --agent gpt-4o` works (with VM running) +- [ ] All tests pass +- [ ] No PyTorch/transformers in dependencies +- [ ] README has multi-benchmark quick start (WAA, OSWorld, WebArena) + +### openadapt-ml + +- [ ] `pip install openadapt-ml` installs openadapt-evals too +- [ ] `oa ml --help` shows training commands +- [ ] `oa ml train --help` works +- [ ] `oa evals run --agent policy` works with trained model +- [ ] All tests pass +- [ ] Imports from openadapt_evals work correctly +- [ ] Dependency direction: openadapt-ml → openadapt-evals (not circular) + +--- + +## Part 8: Marketing Positioning + +### openadapt-evals + +**Tagline**: "GUI agent benchmark toolkit - evaluate agents on WAA, OSWorld, WebArena" + +**README opener**: +```markdown +# openadapt-evals + +The easiest way to run GUI agent benchmarks. + +## Quick Start + +```bash +pip install openadapt-evals +oa evals vm setup # One-time Azure VM setup +oa evals run --agent gpt-4o --tasks 10 +oa evals view # See results +``` + +No ML dependencies. No complex setup. Just benchmarks. 
+ +## Supported Benchmarks + +- **Windows Agent Arena (WAA)** - 154 Windows desktop tasks +- **OSWorld** - Cross-platform desktop (coming soon) +- **WebArena/VisualWebArena** - Browser tasks (coming soon) +``` + +**Target audience**: Researchers evaluating agents, teams benchmarking LLM capabilities + +### openadapt-ml + +**Tagline**: "Train custom GUI automation agents" + +**README opener**: +```markdown +# openadapt-ml + +Train and fine-tune VLMs for GUI automation. Built on openadapt-evals. + +## Quick Start + +```bash +pip install openadapt-ml +oa ml train --capture ./recording --goal "Open Notepad and type Hello" +oa evals run --agent policy --checkpoint ./model +``` + +Full ML training pipeline with benchmark evaluation built in. +``` + +**Target audience**: ML engineers building GUI agents, researchers training custom models + +--- + +## Part 9: Future Considerations + +### Adding New Benchmarks + +To add a new benchmark (e.g., OSWorld, WebArena): + +1. Create adapter in `openadapt_evals/adapters/{benchmark}/` +2. Add CLI commands in `openadapt_evals/cli/{benchmark}.py` +3. Add VM/container setup if needed in `infrastructure/` +4. Update README with benchmark-specific quick start + +No new repos needed - openadapt-evals supports all benchmarks. + +### If We Split Again + +The two-package structure is already clean. If further splitting needed: + +- **openadapt-evals-azure**: Azure-specific infrastructure (for non-Azure users) +- **openadapt-evals-local**: Local-only running (Docker on local machine) + +### Integration with Main OpenAdapt + +``` +OpenAdapt (main) # Capture/recording + ↓ recordings +openadapt-ml # Training + ↓ trained models +openadapt-evals # Evaluation + ↓ benchmark results +``` + +The full pipeline: Capture → Train → Evaluate + +### openadapt-viewer Integration + +Both packages can use openadapt-viewer for HTML generation: +```toml +# Optional dependency +[project.optional-dependencies] +viewer = ["openadapt-viewer>=0.1.0"] +``` diff --git a/docs/WAA_PARALLELIZATION_DESIGN.md b/docs/WAA_PARALLELIZATION_DESIGN.md new file mode 100644 index 0000000..40d7684 --- /dev/null +++ b/docs/WAA_PARALLELIZATION_DESIGN.md @@ -0,0 +1,331 @@ +# WAA Benchmark Parallelization Design + +**Last Updated:** 2026-01-29 + +## Overview + +This document describes two approaches for running Windows Agent Arena (WAA) benchmarks: + +1. **Dedicated VM Approach** (our current setup) - For development, debugging, small runs +2. **Azure ML Compute Approach** (official WAA) - For full benchmark runs at scale + +## Official WAA Approach: Azure ML Compute + +The official WAA repository uses Azure ML Compute Instances for parallelization. + +**Source:** [README.md](https://github.com/microsoft/WindowsAgentArena/blob/main/README.md) +> "WAA supports the deployment of agents **at scale** using the Azure ML cloud infrastructure, allowing for the parallel running of multiple agents and delivering quick benchmark results for hundreds of tasks in minutes, not days." 
+ +**Implementation:** [scripts/run_azure.py](https://github.com/microsoft/WindowsAgentArena/blob/main/scripts/run_azure.py) + +```python +# Official WAA creates Azure ML Compute Instances +from azure.ai.ml.entities import ComputeInstance + +compute_instance = ComputeInstance( + name=f"w{worker_id}Exp{exp_name}", + size="Standard_D8_v3", # 8 vCPU, nested virtualization + setup_scripts=setup_scripts, + idle_time_before_shutdown_minutes=600, + ssh_public_access_enabled=True +) +ml_client.begin_create_or_update(compute_instance).result() + +# Uses multiprocessing.Process for parallel workers +for worker_id in range(num_workers): + p = Process(target=launch_vm_and_job, args=(worker_id, ...)) + processes.append(p) + p.start() +``` + +--- + +## Our Approach: Dedicated Azure VM + +We use a single dedicated Azure VM for development and debugging. + +### Architecture + +``` +┌─────────────────────────────────────────────────────────────────────────┐ +│ LOCAL MACHINE │ +│ │ +│ openadapt-ml CLI │ +│ ├── SSH tunnel for VNC (localhost:8006 → VM:8006) │ +│ ├── SSH tunnel for WAA API (localhost:5001 → VM:5000) │ +│ └── Direct SSH for commands │ +│ │ +└─────────────────────────────────┬───────────────────────────────────────┘ + │ + ▼ + ┌─────────────┐ + │ waa-eval-vm │ + │ D4ds_v4 │ + │ │ + │ Docker │ + │ └─QEMU │ + │ └─Win11 │ + │ └─WAA │ + └─────────────┘ +``` + +--- + +## When to Use Which Approach + +| Use Case | Dedicated VM | Azure ML Compute | +|----------|--------------|------------------| +| **Development/debugging** | ✅ Better - VNC, SSH, full control | ❌ Harder to debug | +| **Single task testing** | ✅ Simpler | ❌ Overkill | +| **Quick iteration** | ✅ VM stays running | ❌ Compute instances spin up/down | +| **Cost for small runs** | ✅ One VM, pay as you go | ❌ ML workspace overhead | +| **Parallel at scale (40+ workers)** | ❌ Manual VM management | ✅ Designed for this | +| **Full 154-task benchmark** | ❌ ~5 hours sequential | ✅ ~30 min with 10 workers | + +**Recommendation:** +- Use **dedicated VM** for development and debugging +- Use **Azure ML Compute** (official approach) for full benchmark runs + +--- + +## Dedicated VM Details + +### Current Setup + +- **VM Name:** `waa-eval-vm` +- **Size:** `Standard_D4ds_v4` (4 vCPU, 16GB RAM, nested virtualization) +- **IP:** 20.12.180.208 +- **OS:** Ubuntu 22.04 LTS +- **Software:** Docker with `windowsarena/winarena:latest` + +### CLI Commands + +```bash +# VM management +uv run python -m openadapt_ml.benchmarks.cli create # Create VM +uv run python -m openadapt_ml.benchmarks.cli status # Check status +uv run python -m openadapt_ml.benchmarks.cli probe # Check WAA server +uv run python -m openadapt_ml.benchmarks.cli vnc # Open VNC tunnel +uv run python -m openadapt_ml.benchmarks.cli logs # View logs +uv run python -m openadapt_ml.benchmarks.cli deallocate # Stop billing +uv run python -m openadapt_ml.benchmarks.cli delete # Delete VM +``` + +### Access + +- **VNC:** http://localhost:8006 (via SSH tunnel) +- **SSH:** `ssh azureuser@20.12.180.208` + +--- + +## Azure ML Compute Details (Official WAA) + +### Setup Requirements + +1. Azure subscription with ML workspace +2. Storage account for golden image +3. Compute instance startup script +4. 
vCPU quota (8 vCPU per worker × N workers) + +### Running Official WAA at Scale + +```bash +cd WindowsAgentArena + +# Run with 10 workers +python scripts/run_azure.py \ + --num_workers 10 \ + --agent navi \ + --model_name gpt-4o \ + --json_name evaluation_examples_windows/test_all.json +``` + +### Cost Estimate (Azure ML) + +| Workers | VM Size | vCPUs Each | Total vCPUs | Time for 154 tasks | Est. Cost | +|---------|---------|------------|-------------|-------------------|-----------| +| 1 | D8_v3 | 8 | 8 | ~5 hours | ~$2 | +| 5 | D8_v3 | 8 | 40 | ~1 hour | ~$2 | +| 10 | D8_v3 | 8 | 80 | ~30 min | ~$2 | + +--- + +## Components + +### 1. Dedicated Azure VMs + +Each VM is identical: +- **Size:** `Standard_D4ds_v4` (4 vCPU, 16GB RAM, nested virtualization) +- **OS:** Ubuntu 22.04 LTS +- **Software:** Docker with `windowsarena/winarena:latest` image +- **Inside Docker:** QEMU running Windows 11 with WAA Flask server + +### 2. Task Distribution + +- 154 total WAA tasks +- Tasks distributed round-robin across N VMs +- Each VM runs tasks sequentially (WAA limitation - one Windows instance per container) +- No inter-VM communication needed (embarrassingly parallel) + +### 3. Orchestration (ThreadPoolExecutor) + +```python +# Simplified pseudocode +with ThreadPoolExecutor(max_workers=N) as executor: + # Phase 1: Create VMs in parallel + vm_futures = [executor.submit(create_vm, f"waa-eval-vm-{i}") for i in range(N)] + vms = [f.result() for f in vm_futures] + + # Phase 2: Distribute tasks + task_assignments = distribute_tasks(tasks, vms) # round-robin + + # Phase 3: Run tasks in parallel (one thread per VM) + result_futures = [ + executor.submit(run_tasks_on_vm, vm, assigned_tasks) + for vm, assigned_tasks in task_assignments + ] + results = [f.result() for f in result_futures] + + # Phase 4: Cleanup VMs + for vm in vms: + executor.submit(delete_vm, vm) +``` + +## Tradeoffs: Dedicated VM vs Azure ML Compute + +| Aspect | Dedicated VM (Our Approach) | Azure ML Compute (Official WAA) | +|--------|----------------------------|--------------------------------| +| **Best for** | Development, debugging, small runs | Full benchmark at scale | +| **Simplicity** | Simple Azure CLI | Complex ML SDK | +| **Control** | Full control, VNC, SSH | Managed (less visibility) | +| **Debugging** | Easy - VNC shows Windows | Harder - logs only | +| **Parallelization** | Manual (multiple VMs) | Built-in (num_workers flag) | +| **Cost** | Pay for VM only | VM + ML workspace | +| **Dependencies** | Azure CLI | Azure ML SDK, workspace, storage | + +**Decision:** Use BOTH approaches for different purposes. + +## VM Lifecycle + +``` +┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ +│ CREATE │────▶│ SETUP │────▶│ RUN │────▶│ DELETE │ +└──────────┘ └──────────┘ └──────────┘ └──────────┘ + │ │ │ │ + ▼ ▼ ▼ ▼ + az vm create docker pull run.py az vm delete + ~2 min winarena:latest tasks ~1 min + Windows boot ~2 min/task + ~15 min (first) + ~3 min (cached) +``` + +### Optimization: Pre-warmed VM Pool + +To avoid 15-minute first-boot time: +1. Create VMs once with Windows installed +2. **Deallocate** (stops billing, preserves disk) +3. **Start** when needed (~2 min) +4. Run tasks +5. **Deallocate** again (not delete) + +```bash +# Initial setup (once) +uv run python -m openadapt_ml.benchmarks.cli create --name waa-eval-vm-1 +uv run python -m openadapt_ml.benchmarks.cli create --name waa-eval-vm-2 +# ... wait for Windows to install on each ... 
+ +# Before benchmark run +uv run python -m openadapt_ml.benchmarks.cli vm-start --name waa-eval-vm-1 +uv run python -m openadapt_ml.benchmarks.cli vm-start --name waa-eval-vm-2 + +# After benchmark run (stops billing, keeps disk) +uv run python -m openadapt_ml.benchmarks.cli deallocate --name waa-eval-vm-1 +uv run python -m openadapt_ml.benchmarks.cli deallocate --name waa-eval-vm-2 +``` + +## Scaling Considerations + +### Azure vCPU Quota + +| VM Size | vCPUs | Max VMs (10 vCPU quota) | Max VMs (40 vCPU quota) | +|---------|-------|-------------------------|-------------------------| +| D4ds_v4 | 4 | 2 | 10 | +| D2ds_v4 | 2 | 5 | 20 | + +**Current quota:** 10 vCPUs (Standard D Family) +**Recommended:** Request increase to 40+ vCPUs for 10 parallel VMs + +### Cost Estimate + +| Workers | VM Size | $/hr each | Total $/hr | 154 tasks @ 2min/task | Total Cost | +|---------|---------|-----------|------------|----------------------|------------| +| 1 | D4ds_v4 | $0.19 | $0.19 | 5.1 hrs | ~$1.00 | +| 5 | D4ds_v4 | $0.19 | $0.95 | 1.0 hr | ~$1.00 | +| 10 | D4ds_v4 | $0.19 | $1.90 | 0.5 hr | ~$1.00 | + +**Note:** More workers = faster completion, similar total cost (dominated by compute time, not wall time). + +## CLI Commands (Proposed) + +```bash +# Create a pool of VMs +uv run python -m openadapt_ml.benchmarks.cli pool create --count 5 + +# Start all VMs in pool +uv run python -m openadapt_ml.benchmarks.cli pool start + +# Run benchmark across pool +uv run python -m openadapt_ml.benchmarks.cli run --parallel --tasks 154 + +# Deallocate pool (stop billing) +uv run python -m openadapt_ml.benchmarks.cli pool deallocate + +# Delete pool entirely +uv run python -m openadapt_ml.benchmarks.cli pool delete +``` + +## Implementation Plan + +### Phase 1: Single Dedicated VM (DONE) +- [x] Create VM with CLI (`uv run python -m openadapt_ml.benchmarks.cli create`) +- [x] Run WAA benchmarks on single VM +- [x] VNC access for debugging +- [x] Results collection + +### Phase 2: Scale with Official WAA (TODO) +- [ ] Set up Azure ML workspace (if not exists) +- [ ] Upload golden image to storage account +- [ ] Configure `scripts/run_azure.py` with our credentials +- [ ] Request vCPU quota increase (80+ for 10 workers) +- [ ] Run full 154-task benchmark with `--num_workers 10` + +### Phase 3: Integration (OPTIONAL) +- [ ] Wrapper CLI to invoke official `run_azure.py` +- [ ] Results download and analysis +- [ ] Cost tracking + +**Note:** We're NOT building our own VM pool management. The official WAA `run_azure.py` already does this well. 
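A wrapper in that spirit can stay very thin: call the official script and let it handle compute orchestration. A minimal sketch, with argument names mirroring the `run_azure.py` invocation shown earlier and the vendored path from the Files table below; this is not the actual CLI implementation:

```python
# Thin wrapper sketch: defer all parallelization to the official WAA run_azure.py.
# Paths and argument names mirror examples elsewhere in this document.
import subprocess


def run_official_waa(
    num_workers: int = 10,
    agent: str = "navi",
    model_name: str = "gpt-4o",
    json_name: str = "evaluation_examples_windows/test_all.json",
    waa_repo: str = "vendor/WindowsAgentArena",
) -> None:
    cmd = [
        "python", "scripts/run_azure.py",
        "--num_workers", str(num_workers),
        "--agent", agent,
        "--model_name", model_name,
        "--json_name", json_name,
    ]
    subprocess.run(cmd, cwd=waa_repo, check=True)
```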
+ +## Files + +| File | Purpose | +|------|---------| +| `openadapt_ml/benchmarks/cli.py` | CLI for single dedicated VM (dev/debug) | +| `vendor/WindowsAgentArena/scripts/run_azure.py` | Official WAA parallel execution | + +## Related Documents + +- `docs/WAA_APPROACH_REVIEW.md` - Why vanilla WAA, not custom Dockerfile +- `CLAUDE.md` - CLI-first development guidelines +- `/Users/abrichr/oa/src/STATUS.md` - Project priorities +- [Official WAA README](https://github.com/microsoft/WindowsAgentArena/blob/main/README.md) - Azure ML setup instructions + +## Decision Log + +| Date | Decision | Rationale | +|------|----------|-----------| +| 2026-01-29 | Use dedicated VM for dev/debug | Full control, VNC, easy iteration | +| 2026-01-29 | Use official WAA `run_azure.py` for scale | Don't reinvent the wheel | +| 2026-01-29 | Don't build custom VM pool | Official WAA already handles this | +| 2026-01-29 | ThreadPoolExecutor sufficient | Ray is overkill (agent a7d43c3 analysis) | diff --git a/docs/WAA_UNATTENDED_SCALABLE.md b/docs/WAA_UNATTENDED_SCALABLE.md new file mode 100644 index 0000000..2de68c3 --- /dev/null +++ b/docs/WAA_UNATTENDED_SCALABLE.md @@ -0,0 +1,298 @@ +# Unattended Scalable Programmatic WAA + +**Last Updated:** 2026-01-29 + +## Goal + +Run Windows Agent Arena (WAA) benchmark with: +- **Unattended**: No manual intervention (Windows auto-installs, server auto-starts) +- **Scalable**: N parallel workers (10+ for full 154-task benchmark in ~30 min) +- **Programmatic**: Single command execution + +## Current State + +### What Official WAA Provides + +``` +┌─────────────────────────────────────────────────────────────────────────┐ +│ LOCAL: python scripts/run_azure.py --num_workers 10 │ +└─────────────────────────────────┬───────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────┐ +│ AZURE ML WORKSPACE │ +│ │ +│ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ │ +│ │ Compute │ │ Compute │ │ Compute │ ... │ Compute │ │ +│ │ Instance │ │ Instance │ │ Instance │ │ Instance │ │ +│ │ Worker 0 │ │ Worker 1 │ │ Worker 2 │ │ Worker N │ │ +│ └────┬─────┘ └────┬─────┘ └────┬─────┘ └────┬─────┘ │ +│ │ │ │ │ │ +│ ▼ ▼ ▼ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────┐ │ +│ │ Each instance runs: Docker → QEMU → Windows → WAA → Navi │ │ +│ └─────────────────────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────────────┘ +``` + +**Pros:** +- ✅ Parallelization built-in (`--num_workers N`) +- ✅ Azure ML handles compute lifecycle +- ✅ Auto-shutdown on idle +- ✅ Results to Azure Storage + +**Cons:** +- ❌ Only supports Navi agent (not our API agents) +- ❌ Requires pre-uploaded golden image to Azure Storage +- ❌ Complex Azure ML setup (workspace, storage, startup script) +- ❌ Limited debugging (no VNC) + +### What We Built + +| Component | Purpose | Useful? 
| +|-----------|---------|---------| +| `waa_deploy/Dockerfile` | Auto-download Windows, API agent support | ✅ For dev | +| `waa_deploy/api_agent.py` | Claude/OpenAI agent (alternative to Navi) | ✅ Key differentiator | +| `cli.py` | Dedicated VM management | ✅ For dev/debug | +| `WAALiveAdapter` | Connects to WAA server API | ✅ Portable | +| `ApiAgent` | Structured actions via LLM API | ✅ Portable | + +--- + +## Synthesized Approach + +### Architecture + +``` +┌─────────────────────────────────────────────────────────────────────────┐ +│ LOCAL: uv run python -m openadapt_ml.benchmarks.cli scale │ +│ --workers 10 --agent api-openai --tasks 154 │ +└─────────────────────────────────┬───────────────────────────────────────┘ + │ + ┌─────────────┴─────────────┐ + │ Use official run_azure.py│ + │ for compute orchestration│ + └─────────────┬─────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────┐ +│ AZURE ML COMPUTE INSTANCES │ +│ │ +│ Each instance runs our modified Docker image: │ +│ ┌─────────────────────────────────────────────────────────────────┐ │ +│ │ waa-auto:latest (our Dockerfile) │ │ +│ │ ├── dockurr/windows (auto-downloads Windows 11) │ │ +│ │ ├── windowsarena/winarena components │ │ +│ │ ├── api_agent.py (Claude/OpenAI support) │ │ +│ │ └── Auto-start WAA server on boot │ │ +│ └─────────────────────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────────────┘ +``` + +### Key Insight + +**Don't reinvent parallelization.** Use official `run_azure.py` for compute orchestration, but: +1. Replace their Docker image with ours (`waa-auto:latest`) +2. Add our API agent to the agent options + +--- + +## Implementation Plan + +### Phase 1: Validate Single Worker (DONE) + +- [x] Dedicated VM working (`waa-eval-vm`) +- [x] VNC access for debugging +- [x] WAA server auto-starts +- [x] Benchmark runs with Navi agent + +### Phase 2: Add API Agent to Official WAA + +**Goal:** Run `python run_azure.py --agent api-openai` + +**Steps:** + +1. **Create NaviAgent-compatible wrapper:** + ```python + # mm_agents/openadapt/agent.py + class OpenAdaptAgent: + """Wrapper to use our ApiAgent with official WAA runner.""" + + def __init__(self, model="gpt-4o", provider="openai"): + self.provider = provider + self.model = model + # Initialize API client + + def predict(self, instruction: str, obs: Dict) -> List[str]: + """Convert observation → API call → action code.""" + # 1. Extract screenshot from obs + # 2. Call OpenAI/Claude API + # 3. Parse response to action + # 4. Return as Python code string + return [f"computer.mouse.click(x={x}, y={y})"] + + def reset(self): + self.history = [] + ``` + +2. **Modify official `run.py` to support new agent:** + ```python + # In run.py, add: + elif cfg_args["agent_name"] == "api-openai": + from mm_agents.openadapt.agent import OpenAdaptAgent + agent = OpenAdaptAgent(provider="openai", model=cfg_args["model"]) + elif cfg_args["agent_name"] == "api-claude": + from mm_agents.openadapt.agent import OpenAdaptAgent + agent = OpenAdaptAgent(provider="anthropic", model=cfg_args["model"]) + ``` + +3. **Test locally first:** + ```bash + # On dedicated VM + cd /client + python run.py --agent api-openai --model gpt-4o --test_all_meta_path ... + ``` + +### Phase 3: Push Custom Image to Azure + +**Goal:** Azure ML uses our `waa-auto:latest` instead of `windowsarena/winarena:latest` + +**Steps:** + +1. 
**Push to Azure Container Registry:** + ```bash + # Build locally + docker build -t waa-auto:latest -f waa_deploy/Dockerfile . + + # Tag for ACR + docker tag waa-auto:latest openadaptacr.azurecr.io/waa-auto:latest + + # Push + az acr login --name openadaptacr + docker push openadaptacr.azurecr.io/waa-auto:latest + ``` + +2. **Modify `run_azure.py` to use our image:** + ```python + # Change default: + parser.add_argument('--docker_img_name', + default='openadaptacr.azurecr.io/waa-auto:latest', # Was: windowsarena/winarena:latest + help='Docker image name') + ``` + +### Phase 4: Wrapper CLI + +**Goal:** Single command for everything + +```bash +# Full benchmark with 10 workers +uv run python -m openadapt_ml.benchmarks.cli scale \ + --workers 10 \ + --agent api-openai \ + --model gpt-4o \ + --tasks all + +# Subset for testing +uv run python -m openadapt_ml.benchmarks.cli scale \ + --workers 2 \ + --agent api-claude \ + --tasks notepad_1,notepad_2,browser_1 +``` + +**Implementation:** +```python +# In cli.py, add 'scale' command that: +# 1. Ensures Azure ML workspace exists +# 2. Ensures our image is in ACR +# 3. Calls run_azure.py with appropriate args +# 4. Monitors progress +# 5. Downloads results when done +``` + +--- + +## File Changes Required + +| File | Change | Effort | +|------|--------|--------| +| `mm_agents/openadapt/agent.py` | NEW: NaviAgent-compatible wrapper | ~100 lines | +| `run.py` | MODIFY: Add api-openai/api-claude agent options | ~10 lines | +| `waa_deploy/Dockerfile` | EXISTING: Already has api_agent.py | Done | +| `cli.py` | ADD: `scale` command | ~200 lines | +| `run_azure.py` | MODIFY: Default to our Docker image | ~5 lines | + +--- + +## Prerequisites + +### Azure Setup (One-time) + +1. **Azure ML Workspace** (if not exists) + ```bash + az ml workspace create -n openadapt-ml -g openadapt-agents + ``` + +2. **Azure Container Registry** + ```bash + az acr create -n openadaptacr -g openadapt-agents --sku Basic + ``` + +3. **vCPU Quota** (request increase) + - Standard_D8_v3: 8 vCPUs per worker + - 10 workers = 80 vCPUs needed + - Request via Azure Portal → Quotas + +4. **Upload startup script** to Azure ML Notebooks + - Path: `Users//compute-instance-startup.sh` + - Content: From `scripts/azure_files/compute-instance-startup.sh` + +### Environment Variables + +```bash +# .env file +AZURE_SUBSCRIPTION_ID=... +AZURE_ML_RESOURCE_GROUP=openadapt-agents +AZURE_ML_WORKSPACE_NAME=openadapt-ml +OPENAI_API_KEY=sk-... +ANTHROPIC_API_KEY=sk-ant-... +``` + +--- + +## Cost Estimate + +| Workers | VM Size | Time for 154 tasks | Compute Cost | API Cost (GPT-4o) | Total | +|---------|---------|-------------------|--------------|-------------------|-------| +| 1 | D8_v3 | ~5 hours | ~$2.50 | ~$5 | ~$7.50 | +| 5 | D8_v3 | ~1 hour | ~$2.50 | ~$5 | ~$7.50 | +| 10 | D8_v3 | ~30 min | ~$2.50 | ~$5 | ~$7.50 | + +**Note:** More workers = faster, same total cost (compute + API calls are constant). 
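
**Sketch: `scale` wrapper (illustrative).** Before the summary, a minimal sketch of the `scale` command described in Phase 4. It assumes the official `run_azure.py` accepts `--num_workers` and `--docker_img_name` as discussed above; the `--agent_name`, `--model`, and `--task_ids` flags and the function names are illustrative assumptions, not an existing interface.

```python
# Hypothetical sketch only: wraps the official WAA run_azure.py instead of
# reimplementing parallelization. Flag names other than --num_workers and
# --docker_img_name are assumptions and may need to match the real script.
import subprocess
from pathlib import Path

RUN_AZURE = Path("vendor/WindowsAgentArena/scripts/run_azure.py")
DOCKER_IMAGE = "openadaptacr.azurecr.io/waa-auto:latest"


def scale(workers: int, agent: str, model: str, tasks: str = "all") -> None:
    """Kick off a parallel WAA run on Azure ML via the official runner."""
    cmd = [
        "python", str(RUN_AZURE),
        "--num_workers", str(workers),
        "--docker_img_name", DOCKER_IMAGE,
        "--agent_name", agent,  # assumed flag (mirrors run.py's cfg_args["agent_name"])
        "--model", model,       # assumed flag
    ]
    if tasks != "all":
        cmd += ["--task_ids", tasks]  # assumed flag for task subsets

    # Steps 1-2 (workspace/ACR checks) and 4-5 (monitoring, result download)
    # from the Phase 4 description are omitted in this sketch.
    subprocess.run(cmd, check=True)
```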
+ +--- + +## Summary + +| Aspect | Approach | +|--------|----------| +| **Parallelization** | Use official `run_azure.py` (Azure ML Compute) | +| **Docker Image** | Our `waa-auto:latest` (auto-download Windows, API agents) | +| **Agent** | Our `OpenAdaptAgent` wrapper (uses Claude/OpenAI) | +| **CLI** | Wrapper command `cli.py scale` | +| **Development** | Dedicated VM with VNC for debugging | + +**Total new code:** ~300 lines +**Reused from official WAA:** Parallelization, compute management, task distribution +**Reused from our work:** Dockerfile, api_agent.py, WAALiveAdapter concepts + +--- + +## Next Steps + +1. [ ] Create `mm_agents/openadapt/agent.py` wrapper (~100 lines) +2. [ ] Test on dedicated VM with `--agent api-openai` +3. [ ] Push `waa-auto:latest` to Azure Container Registry +4. [ ] Modify `run_azure.py` to use our image +5. [ ] Add `scale` command to CLI +6. [ ] Request vCPU quota increase (80+ for 10 workers) +7. [ ] Run full 154-task benchmark diff --git a/docs/research/cua_waa_comparison.md b/docs/research/cua_waa_comparison.md new file mode 100644 index 0000000..f4b12d0 --- /dev/null +++ b/docs/research/cua_waa_comparison.md @@ -0,0 +1,607 @@ +# Cua vs OpenAdapt-ML Windows Agent Arena (WAA) Implementation Comparison + +**Date**: 2026-01-28 (Updated) +**Status**: Research Analysis +**Author**: Research Agent + +--- + +## Quick Reference: Key Metrics + +| Metric | Cua/OpenAI CUA | OpenAdapt-ML | Microsoft WAA (Navi) | +|--------|----------------|--------------|----------------------| +| WAA Success Rate | N/A (OSWorld: 38.1%) | In progress | 19.5% (GPT-4V) | +| OSWorld Success Rate | 38.1% (OpenAI CUA) | Not implemented | N/A | +| Human Baseline | 72-74.5% | 74.5% (WAA) | 74.5% | +| VM Setup Time | Minutes (Lume) | ~15-20 min (Azure) | ~20 min | +| Primary Platform | macOS (Apple Silicon) | Windows (Azure) | Windows (Azure) | + +--- + +## Executive Summary + +This document analyzes [Cua (trycua/cua)](https://github.com/trycua/cua), a YC X25-backed open-source platform for Computer-Use Agents, and compares it with our OpenAdapt-Evals/OpenAdapt-ML two-package architecture. + +**Key Finding**: Cua represents a significantly more comprehensive infrastructure platform that addresses many problems we've been solving piecemeal. However, adopting Cua wholesale would require substantial architectural changes and has notable trade-offs around Windows/Azure focus, Apple Silicon dependency, and our training pipeline integration. + +**Recommendation**: Consider incremental adoption of Cua components, starting with cua-bench adapters for benchmark standardization, rather than full migration. + +--- + +## 1. What is Cua? + +### Overview + +Cua ("koo-ah") is an open-source infrastructure platform for developing, evaluating, and deploying Computer-Use Agents. According to their [Hacker News launch](https://news.ycombinator.com/item?id=46768906) and [HuggingFace blog](https://huggingface.co/blog/cua-ai/cua-bench): + +> "Cua is Docker for Computer-Use AI Agents - it enables AI agents to control full operating systems in virtual containers and deploy them locally or to the cloud." 
+ +### Core Components + +The Cua ecosystem is organized as a monorepo with these key packages: + +| Package | Purpose | Tech Stack | +|---------|---------|------------| +| **cua-agent** | AI agent framework for computer-use tasks | Python | +| **cua-computer** | SDK for controlling desktop environments | Python | +| **cua-computer-server** | Sandbox driver for UI interactions | Python/FastAPI | +| **cua-bench** | Benchmarks and RL environments | Python | +| **lume** | macOS/Linux VM management on Apple Silicon | Swift/CLI | +| **lumier** | Docker-compatible interface for Lume VMs | Python | +| **som** | Set-of-Mark for OmniParser integration | Python | +| **pylume** | Python bindings for Lume | Python | +| **mcp-server** | Multi-Modal Control Protocol server for Claude Desktop | Python | + +### Key Capabilities + +1. **Multi-Platform Virtualization**: + - macOS/Linux via Apple Virtualization Framework (97% native CPU speed on Apple Silicon) + - Windows via Docker/QEMU + - Cloud deployment support + +2. **Composite Agents Architecture**: + - Separate grounding model (fast, small) from reasoning model (large) + - Model-agnostic: supports Anthropic, OpenAI, Google, Ollama, LM Studio + +3. **Unified Benchmark Framework (cua-bench)**: + - Adapters for OSWorld, ScreenSpot, WindowsArena + - Trajectory export for training + - RL environment support + +4. **Training Data Generation**: + - "Trajectory replotting": Record 1 demo, render across 10 OS themes = 10 training trajectories + - HTML snapshots with bounding boxes, not just screenshots + - Multi-resolution (640x480 to 3440x1440) + +--- + +## 2. Cua's Approach to Computer Use Automation + +### Architecture Philosophy + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Cua Platform │ +├─────────────────────────────────────────────────────────────────┤ +│ Agent Layer (cua-agent) │ +│ ├── ComputerAgent - Main agent class │ +│ ├── Provider adapters (Anthropic, OpenAI, Ollama, etc.) │ +│ └── Composite agents (grounding + reasoning split) │ +├─────────────────────────────────────────────────────────────────┤ +│ Computer Layer (cua-computer) │ +│ ├── Computer class - Unified interface │ +│ ├── Display drivers (screen capture, coordinates) │ +│ └── Input drivers (mouse, keyboard) │ +├─────────────────────────────────────────────────────────────────┤ +│ Sandbox Layer │ +│ ├── Lume (Apple Silicon VMs - macOS/Linux) │ +│ ├── Docker/QEMU (Windows, Linux) │ +│ └── Cloud containers (cua-cloud) │ +├─────────────────────────────────────────────────────────────────┤ +│ Benchmark Layer (cua-bench) │ +│ ├── OSWorld adapter │ +│ ├── WindowsArena adapter │ +│ ├── ScreenSpot adapter │ +│ └── Custom task definitions │ +└─────────────────────────────────────────────────────────────────┘ +``` + +### Key Technical Decisions + +1. **Sandbox-First**: Every agent runs in an isolated VM/container. This is non-negotiable for safety. + +2. **Playwright-Like API**: Tasks defined with declarative Python decorators: + ```python + @cb.setup_task + async def setup(env, scenario): + await env.spotify.open() + await env.spotify.create_playlist(scenario["playlist_name"]) + + @cb.solve_task + async def solve(env, scenario): + await env.spotify.search(scenario["song"]) + ``` + +3. **HTML + Screenshots**: Captures full HTML with bounding boxes, accessibility labels, and CSS - not just screenshots. This enables: + - Element-level grounding + - Style variation generation + - More robust training data + +4. 
**Shell Applications**: Simulated apps (Spotify, Slack clones) that run in lightweight webtops without VM overhead. Enables rapid iteration. + +--- + +## 3. Comparison with Our WAA-Based Evaluation Setup + +### Our Current Architecture + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ OpenAdapt Ecosystem │ +├─────────────────────────────────────────────────────────────────┤ +│ openadapt-ml (Training) │ +│ ├── training/ - VLM fine-tuning pipeline │ +│ ├── vlm/ - Model adapters (Qwen, API-based) │ +│ ├── baselines/ - Baseline model adapters │ +│ ├── benchmarks/cli.py - VM lifecycle management │ +│ └── cloud/ - Lambda Labs, Azure ML │ +├─────────────────────────────────────────────────────────────────┤ +│ openadapt-evals (Evaluation) │ +│ ├── agents/ - BenchmarkAgent implementations │ +│ │ ├── ApiAgent (Claude, GPT-5.1) │ +│ │ ├── PolicyAgent (trained models) │ +│ │ └── RetrievalAgent (demo-conditioned) │ +│ ├── adapters/ - Benchmark adapters │ +│ │ ├── WAAMockAdapter │ +│ │ └── WAALiveAdapter │ +│ └── benchmarks/ - Runner, viewer, Azure orchestration │ +├─────────────────────────────────────────────────────────────────┤ +│ Infrastructure │ +│ ├── Azure VMs (Standard_D4ds_v5 with nested virt) │ +│ ├── Docker + QEMU (Windows 11 Enterprise via WAA image) │ +│ └── SSH tunnels for VNC/API access │ +└─────────────────────────────────────────────────────────────────┘ +``` + +### Side-by-Side Comparison + +| Aspect | Cua | OpenAdapt-Evals/ML | +|--------|-----|-------------------| +| **Scope** | Full platform (sandboxes, SDKs, benchmarks, training) | Focused on evaluation + ML training | +| **Sandbox Technology** | Lume (Apple Silicon) + Docker/QEMU | Azure VMs + Docker/QEMU | +| **Primary Platform** | macOS first, then Linux/Windows | Windows first (WAA-focused) | +| **Local Dev Experience** | Native macOS VMs on Apple Silicon | Requires Azure VM or local Docker | +| **Benchmark Support** | OSWorld, ScreenSpot, WAA via adapters | WAA only (others planned) | +| **Training Data Gen** | Built-in trajectory replotting | Manual demo collection | +| **Agent Architecture** | Composite (grounding + reasoning) | Monolithic (single API call) | +| **VM Performance** | 97% native on Apple Silicon | Nested virtualization overhead | +| **Cloud Support** | cua-cloud (managed service coming) | Azure VMs, Lambda Labs for training | +| **RL Support** | Native RL environments in cua-bench | Not implemented | +| **Model Agnostic** | Yes (100+ providers) | Yes (Anthropic, OpenAI, local VLMs) | +| **Package Count** | 8+ packages in monorepo | 2 packages | +| **Dependencies** | Python 3.12+ required | Python 3.10+ | +| **Lines of Code** | ~15K+ (estimated) | ~8K | +| **Documentation** | Extensive (cua.ai/docs) | CLAUDE.md + README | +| **Community** | YC-backed, active development | Internal OpenAdapt project | + +### Benchmark Framework Comparison + +#### cua-bench + +```python +# Task definition +@cb.tasks_config +def config(): + return {"scenarios": [{"playlist_name": "Workout", "song": "Eye of the Tiger"}, ...]} + +@cb.setup_task +async def setup(env, scenario): + await env.spotify.create_playlist(scenario["playlist_name"]) + +@cb.solve_task +async def solve(env, scenario): + await env.spotify.search(scenario["song"]) + await env.spotify.add_to_playlist(scenario["playlist_name"]) + +@cb.evaluate_task +async def evaluate(env, scenario): + playlist = await env.spotify.get_playlist(scenario["playlist_name"]) + return scenario["song"] in playlist.songs +``` + +**Key Features**: +- Declarative 
task definition +- Scenario variation injection +- Automatic trajectory recording +- Shell application support (simulated apps) + +#### openadapt-evals + +```python +# Task loaded from JSON +adapter = WAALiveAdapter(server_url="http://vm:5000") +task = adapter.load_task("notepad_1") + +# Agent interaction +agent = ApiAgent(provider="anthropic") +obs = adapter.reset(task) +action = agent.act(obs, task) +obs, done, info = adapter.step(action) +result = adapter.evaluate(task) +``` + +**Key Features**: +- Uses upstream WAA task definitions +- HTTP adapter to WAA server +- Execution trace collection +- P0 demo persistence fix in ApiAgent + +--- + +## 4. Key Differences in Architecture + +### 4.1 Sandbox Philosophy + +| Cua | OpenAdapt | +|-----|-----------| +| Sandboxes are the core primitive | VMs are infrastructure detail | +| Local-first (Apple Silicon VMs) | Cloud-first (Azure VMs) | +| Multiple sandbox types unified | Single sandbox type (WAA Docker) | +| Safety is architectural constraint | Safety via SSH/isolation | + +**Implication**: Cua's sandbox-first design makes it safer and more portable, but requires Lume infrastructure which is Apple Silicon-only. + +### 4.2 Training Data Generation + +| Cua | OpenAdapt | +|-----|-----------| +| Trajectory replotting (1 demo → N variants) | Manual demo collection | +| HTML + screenshots captured | Screenshots only in WAA | +| Built-in visual diversity generation | No automatic variation | +| Shell apps for fast iteration | Full VM required | + +**Implication**: Cua can generate significantly more diverse training data from fewer human demonstrations. This addresses the "10x performance variance across UI changes" problem they identified. + +### 4.3 Agent Architecture + +| Cua | OpenAdapt | +|-----|-----------| +| Composite agents (grounding + reasoning) | Monolithic agents | +| Explicit OmniParser/SoM integration | SoM mode supported but not primary | +| Cost-optimized (small model for grounding) | Full API call for each decision | + +**Implication**: Cua's composite approach could reduce API costs and improve grounding accuracy by using specialized models for each subtask. + +### 4.4 Benchmark Integration + +| Cua | OpenAdapt | +|-----|-----------| +| Unified adapter interface across benchmarks | WAA-specific adapter | +| Native adapters for OSWorld, ScreenSpot, WAA | WAA only (others TODO) | +| Benchmark-agnostic task format | BenchmarkTask dataclass | +| RL environment support | Evaluation only | + +**Implication**: Cua already has the multi-benchmark support we're planning in REPO_CONSOLIDATION_PLAN.md. + +--- + +## 5. Should We Adopt Cua or Parts of It? + +### Arguments FOR Adoption + +1. **Multi-Benchmark Support**: They've already built adapters for OSWorld, ScreenSpot, WAA - exactly what we need. + +2. **Training Data Generation**: Trajectory replotting would dramatically improve our training data diversity. + +3. **Active Development**: YC-backed with active community. They're solving the same problems we are. + +4. **Better Local Dev**: macOS VMs on Apple Silicon would enable faster iteration for Mac users. + +5. **RL Support**: Native RL environments would enable future research directions. + +6. **MCP Integration**: Claude Desktop integration via MCP server. + +### Arguments AGAINST Full Adoption + +1. **Apple Silicon Dependency**: Lume requires Apple Silicon. Our team uses Azure VMs which have no Apple Silicon equivalent. + +2. **Windows Focus Mismatch**: We're focused on Windows (WAA) for enterprise use cases. Cua is macOS-first. 
+ +3. **Training Pipeline Integration**: Our training pipeline (openadapt-ml) is tightly integrated with openadapt-evals. Switching to cua-bench would require significant refactoring. + +4. **Operational Complexity**: 8+ packages vs our 2. More to learn and maintain. + +5. **Python 3.12+ Requirement**: We support Python 3.10+. Migration could break user environments. + +6. **Unproven at Scale**: Despite YC backing, it's still early-stage. Our WAA setup is battle-tested. + +7. **Azure VM Investment**: We've invested significant effort in Azure VM automation (PR #14). This would be partially wasted. + +--- + +## 6. Trade-offs Analysis + +### Scenario A: Full Migration to Cua + +**Effort**: High (3-6 months) + +**Benefits**: +- Unified multi-benchmark support +- Training data generation +- Active community support +- MCP/Claude Desktop integration + +**Costs**: +- Significant refactoring of openadapt-ml training pipeline +- Azure VM automation work partially wasted +- New learning curve for team +- Potential compatibility issues with Python 3.10 users + +**Risk**: Medium-High (depending on Cua's stability and our ability to extend it) + +### Scenario B: Adopt cua-bench Adapters Only + +**Effort**: Medium (1-2 months) + +**Benefits**: +- Standardized benchmark interface +- Access to OSWorld, ScreenSpot adapters +- Can still use our Azure VM infrastructure +- Incremental migration path + +**Costs**: +- Must maintain compatibility layer +- Miss out on sandbox/Lume benefits +- Partial adoption may cause confusion + +**Risk**: Low-Medium + +### Scenario C: Adopt Architectural Patterns Only + +**Effort**: Low (2-4 weeks) + +**Benefits**: +- No external dependencies +- Learn from their solutions +- Can implement selectively + +**What to Adopt**: +- Composite agent pattern (grounding + reasoning) +- Trajectory replotting concept +- Declarative task definition style +- HTML capture alongside screenshots + +**Costs**: +- Must implement ourselves +- No community support + +**Risk**: Low + +### Scenario D: Stay Current Course + +**Effort**: None + +**Benefits**: +- Known system, no learning curve +- REPO_CONSOLIDATION_PLAN.md already addresses multi-benchmark support +- Full control over architecture + +**Costs**: +- Slower to add OSWorld, other benchmarks +- No training data generation automation +- Potentially duplicating work + +**Risk**: Low (but higher opportunity cost) + +--- + +## 7. Recommendations + +### Immediate (Next 2-4 Weeks) + +1. **Do NOT migrate to Cua wholesale**. The Azure VM investment is too recent, and we have a working system. + +2. **Adopt the composite agent pattern** in ApiAgent: + - Add optional grounding model (OmniParser/SoM) + - Use small model for element detection, large model for reasoning + - This is an incremental change to existing code + +3. **Add HTML capture** to WAALiveAdapter: + - Capture accessibility tree alongside screenshots + - Enables future training data diversity + +### Medium-Term (Next 2-3 Months) + +4. **Evaluate cua-bench integration**: + - Test if cua-bench adapters can work with our evaluation runner + - If compatible, adopt their OSWorld/ScreenSpot adapters + - Keep our WAALiveAdapter for Azure VM compatibility + +5. **Implement trajectory replotting prototype**: + - Record demos with HTML + screenshots + - Test re-rendering across Windows themes + - Measure training data quality improvement + +### Long-Term (6+ Months) + +6. 
**Consider Lume for local development**: + - If team has Apple Silicon Macs + - Would enable faster local iteration + - Keep Azure VMs for CI/production + +7. **Contribute back to Cua**: + - Our Azure VM automation could benefit the community + - Windows-focused improvements + +--- + +## 8. Specific Recommendations for REPO_CONSOLIDATION_PLAN.md + +Our current consolidation plan is **still valid** but should incorporate these learnings: + +1. **Keep the two-package split** (openadapt-evals + openadapt-ml). Cua's monorepo with 8+ packages is more complex than necessary for our use case. + +2. **Add benchmark adapter interface** compatible with cua-bench: + ```python + class BenchmarkAdapter(ABC): + # Our current interface is similar to cua-bench + # Add optional HTML capture in observations + # Add evaluation spec support + ``` + +3. **Prioritize OSWorld adapter** as second benchmark (after WAA). Cua's OSWorld-Verified work validates this as the next target. + +4. **Consider shell applications** for testing: + - Simulated apps for unit tests + - No VM overhead for CI + - This is orthogonal to our VM-based evaluation + +5. **Document composite agent pattern** in CLAUDE.md for future implementation. + +--- + +## 9. Conclusion + +Cua is an impressive and comprehensive platform that addresses many problems we're solving. However, full migration is not recommended at this time due to: + +1. Our recent Azure VM automation investment +2. Apple Silicon dependency in Lume +3. Windows-first focus vs their macOS-first approach + +Instead, we should: +- **Learn from their architecture** (composite agents, trajectory replotting) +- **Evaluate cua-bench adapters** for multi-benchmark support +- **Stay on our current consolidation path** while incorporating their patterns + +The OpenAdapt ecosystem can achieve similar capabilities through incremental improvements rather than wholesale migration. + +--- + +## 10. Market Positioning and Strategic Differentiation + +### 10.1 The Success Rate Gap + +| Agent | Benchmark | Success Rate | Gap to Human | +|-------|-----------|--------------|--------------| +| OpenAI CUA | OSWorld | 38.1% | ~36 pts below human (74.5%) | +| Microsoft Navi | WAA | 19.5% | ~55 pts below human (74.5%) | + +**Key insight**: The problem is far from solved. Both approaches have runway—the technology isn't mature enough for either to dominate yet. + +The 38.1% vs 19.5% gap is significant: +- OSWorld is macOS/Linux focused +- WAA is Windows focused +- **Windows automation appears harder** (more legacy complexity, more app diversity) + +This validates OpenAdapt's focus: Windows enterprise workflows are the harder problem. 
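
As a concrete reference for the composite-agent recommendation in Sections 7 and 8, and before continuing with market positioning in 10.2, the sketch below shows one way a grounding/reasoning split could look on the OpenAdapt side. The `grounder` and `reasoner` interfaces are assumptions for illustration; nothing here exists in openadapt-ml or openadapt-evals today.

```python
# Illustrative sketch of a composite agent (grounding + reasoning split).
# Both model interfaces are hypothetical; this is not existing OpenAdapt code.
from dataclasses import dataclass


@dataclass
class ElementCandidate:
    label: str  # e.g. "Submit button"
    x: float    # normalized [0, 1]
    y: float    # normalized [0, 1]


class CompositeAgent:
    """Small grounding model finds elements; large reasoning model picks one."""

    def __init__(self, grounder, reasoner):
        self.grounder = grounder  # cheap: screenshot path -> list[ElementCandidate]
        self.reasoner = reasoner  # expensive: text prompt -> index as string

    def act(self, screenshot_path: str, instruction: str) -> dict:
        candidates = self.grounder.detect(screenshot_path)
        menu = "\n".join(f"[{i}] {c.label}" for i, c in enumerate(candidates))
        choice = self.reasoner.choose(
            f"Task: {instruction}\nElements:\n{menu}\nReply with one index."
        )
        target = candidates[int(choice)]
        return {"type": "click", "x": target.x, "y": target.y}
```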
+ +### 10.2 Market Positioning + +| Aspect | Cua | OpenAdapt | +|--------|-----|-----------| +| **Primary TAM** | AI Agents / Developer Tools (~$500M-1B, 40%+ CAGR) | Enterprise RPA + Legacy Automation (~$8-10B, 20% CAGR) | +| **Buyer** | ML engineers, AI researchers | Ops, IT, compliance, support | +| **Value Prop** | "Build computer-use agents faster" | "Learn automation from how you already work" | + +### 10.3 Why These Markets Don't Fully Overlap + +- Cua assumes synthetic, controlled environments +- OpenAdapt captures real workflows from production systems +- Enterprise compliance requirements (HIPAA, SOX) favor retrospective capture + +### 10.4 Where Cua's Sandbox Approach Breaks Down + +Cua's sandbox-first design assumes you can: +- Spin up a clean VM with the target app +- Control the environment end-to-end +- Reproduce the workflow deterministically + +**This fails for:** + +| Scenario | Why Sandboxes Fail | OpenAdapt Alternative | +|----------|-------------------|----------------------| +| **Citrix/RDP apps** | No local install possible | Capture remote session natively | +| **Licensed enterprise software** | SAP, Epic, Oracle—can't sandbox without licensing | Record from licensed desktop | +| **Policy-controlled desktops** | Enterprise IT won't allow arbitrary VMs | Capture from existing desktop | +| **Compliance-restricted environments** | Healthcare, finance—can't replicate production | Retrospective recording allowed | +| **Multi-app workflows** | Spanning 5+ apps that can't all be sandboxed together | Single recording captures all | + +**OpenAdapt's retrospective recording doesn't have these constraints.** + +### 10.5 Shell Applications: Where Cua and OpenAdapt Could Converge + +Shell apps (simulated Spotify, Slack clones) serve different purposes: + +| Use Case | Cua's Approach | OpenAdapt's Approach | +|----------|---------------|---------------------| +| Unit tests | Primary use case | Could adopt for CI | +| Training data | Synthetic generation | Not applicable (need real data) | +| Fast iteration | Core workflow | Could speed up agent logic dev | +| Production eval | Not representative | Azure VMs remain primary | + +**Recommendation**: Adopt shell apps for regression testing agent logic, but never train on them. Real behavioral data from enterprise workflows remains the moat. + +### 10.6 Bottom Line + +The 19.5% WAA success rate validates OpenAdapt's approach: +- Windows enterprise automation is hard +- Current agents fail often +- Learning from real human demonstrations is one path to improvement + +Cua's strength (macOS VMs at 97% native speed) doesn't help with SAP, Citrix, or legacy Win32 apps—exactly where OpenAdapt focuses. + +--- + +## 12. Appendix: Agent Loop Types in Cua + +Cua provides multiple agent loop implementations optimized for different use cases: + +| Agent Loop | Best For | Model Support | +|------------|----------|---------------| +| **AgentLoop.OPENAI** | Web-based tasks, browser automation | OpenAI models (requires Tier 3 access) | +| **AgentLoop.ANTHROPIC** | Strong reasoning + computer-use | claude-3-5-sonnet, claude-3-7-sonnet | +| **AgentLoop.UITARS** | OS/desktop tasks, latency-sensitive | UI-TARS-1.5 (local or HuggingFace) | +| **AgentLoop.OMNI** | Maximum flexibility | Any vision-language model | + +### Composite Agent Example + +```python +# Pair a grounding model with a reasoning model +model = "huggingface-local/GTA1-7B+openai/gpt-4o" +# GTA1-7B: precise click coordinates +# GPT-4o: action planning and reasoning +``` + +--- + +## 13. 
Appendix: OpenAdapt-ML Docker Setup Details + +Our current implementation uses a custom Dockerfile that: + +1. **Base**: `dockurr/windows:latest` (modern Windows ISO auto-download) +2. **WAA Components**: Copied from `windowsarena/winarena:latest` +3. **IP Patching**: Changes `20.20.20.21` to `172.30.0.2` for dockurr compatibility +4. **Python**: Uses Python 3.9 from vanilla WAA for GroundingDINO compatibility +5. **Automation**: FirstLogonCommands for firewall, WAA server auto-start + +Key environment variables: +- `VERSION=11e` - Windows 11 Enterprise Evaluation +- `RAM_SIZE=8G` / `16G` (fast mode) +- `CPU_CORES=4` / `6` (fast mode) + +--- + +## References + +- [Cua GitHub Repository](https://github.com/trycua/cua) +- [Cua-Bench HuggingFace Blog](https://huggingface.co/blog/cua-ai/cua-bench) +- [Show HN: Cua-Bench Discussion](https://news.ycombinator.com/item?id=46768906) +- [Launch HN: Cua (YC X25)](https://news.ycombinator.com/item?id=43773563) +- [Cua Documentation](https://cua.ai/docs) +- [Cua Composite Agents Blog](https://www.trycua.com/blog/composite-agents) +- [What is Lume?](https://cua.ai/docs/lume/guide/getting-started/introduction) +- [OSWorld-Verified](https://xlang.ai/blog/osworld-verified) +- [Windows Agent Arena](https://microsoft.github.io/WindowsAgentArena/) +- [Windows Agent Arena Paper](https://arxiv.org/abs/2409.08264) +- [OpenAI Computer-Using Agent](https://openai.com/index/computer-using-agent/) +- [OpenAdapt REPO_CONSOLIDATION_PLAN.md](/Users/abrichr/oa/src/openadapt-ml/docs/REPO_CONSOLIDATION_PLAN.md) diff --git a/docs/waa_speedup_options.md b/docs/waa_speedup_options.md new file mode 100644 index 0000000..1ba5de1 --- /dev/null +++ b/docs/waa_speedup_options.md @@ -0,0 +1,94 @@ +# WAA Speedup Options + +## Summary Table + +| Option | Speedup | Cost Impact | Recommended | +|--------|---------|-------------|-------------| +| `--fast` flag | ~30% install, ~40% eval | +$0.19/hr | YES for dev | +| Deallocated VM | Skip 25min install | ~$1.50/mo | YES for repeat runs | +| Parallelization | 45x (154 tasks) | -78% cost | YES for large benchmarks | + +## Option 1: `--fast` Flag (Double Hardware) + +Use larger VM with more CPU/RAM allocated to QEMU. + +**Usage:** +```bash +# Create fast VM +uv run python -m openadapt_ml.benchmarks.cli create --fast + +# Start with fast QEMU allocation +uv run python -m openadapt_ml.benchmarks.cli start --fast +``` + +**Specs:** + +| Mode | VM Size | vCPU | RAM | QEMU Cores | QEMU RAM | Cost/hr | +|------|---------|------|-----|------------|----------|---------| +| Standard | D4ds_v4 | 4 | 16GB | 4 | 8GB | $0.19 | +| Fast | D8ds_v5 | 8 | 32GB | 6 | 16GB | $0.38 | + +**Expected Speedups:** +- Windows installation: ~30% faster (25min → ~18min) +- Task evaluation: ~40% faster (navi agent ML inference benefits from more CPU) +- Total benchmark (30 tasks): ~35% faster + +**When to use:** +- Development/debugging when you don't want to wait +- Time-sensitive evaluations +- Cost difference is negligible (~$0.19/hr extra) + +## Option 2: Deallocated "Golden" VM + +Keep a VM deallocated after WAA is fully installed. Restart when needed. + +**How it works:** +1. First run: Create VM, install WAA fully (~25 min) +2. After use: `deallocate` (stops billing, keeps disk) +3. 
Next time: `vm-start` → boots in ~2-3 min with WAA ready + +**Cost:** +- Deallocated VM: $0 compute +- Disk storage: ~$0.05/GB/month = ~$1.50/month for 30GB + +**Commands:** +```bash +# After first successful run +uv run python -m openadapt_ml.benchmarks.cli deallocate + +# Next time +uv run python -m openadapt_ml.benchmarks.cli vm-start +uv run python -m openadapt_ml.benchmarks.cli start # Container starts, Windows boots in 2-3 min +``` + +## Option 3: Parallelization (Best for Large Benchmarks) + +Run multiple VMs in parallel for large task sets. + +**Speedup for 154 tasks:** + +| Workers | Time | Cost | vs Single VM | +|---------|------|------|--------------| +| 1 (sequential) | ~15 hours | $2.88 | baseline | +| 5 | ~3 hours | $1.14 | 5x faster, 60% cheaper | +| 10 | ~1.5 hours | $0.63 | 10x faster, 78% cheaper | + +**Implementation:** See `docs/waa_parallelization_plan.md` + +## Quick Reference + +```bash +# Standard mode (default) +uv run python -m openadapt_ml.benchmarks.cli create +uv run python -m openadapt_ml.benchmarks.cli build +uv run python -m openadapt_ml.benchmarks.cli start + +# Fast mode (double hardware) +uv run python -m openadapt_ml.benchmarks.cli create --fast +uv run python -m openadapt_ml.benchmarks.cli build +uv run python -m openadapt_ml.benchmarks.cli start --fast + +# Reuse deallocated VM +uv run python -m openadapt_ml.benchmarks.cli vm-start +uv run python -m openadapt_ml.benchmarks.cli start +``` diff --git a/openadapt_ml/benchmarks/__init__.py b/openadapt_ml/benchmarks/__init__.py index f3e40e3..138df99 100644 --- a/openadapt_ml/benchmarks/__init__.py +++ b/openadapt_ml/benchmarks/__init__.py @@ -1,177 +1,31 @@ """Benchmark integration for openadapt-ml. -DEPRECATION NOTICE: - The canonical benchmark code is now in the `openadapt-evals` package. - For new projects, prefer importing from `openadapt_evals`: +This module provides ML-specific agents for benchmark evaluation. +These agents wrap openadapt-ml internals (trained policies, API adapters). +For benchmark infrastructure (adapters, runners, viewers), use openadapt-evals: ```python - # Preferred (standalone, no openadapt-ml dependency) - from openadapt_evals import ApiAgent, WAAMockAdapter, evaluate_agent_on_benchmark - - # Still supported (uses openadapt-ml internals) - from openadapt_ml.benchmarks import PolicyAgent, APIBenchmarkAgent - ``` - - The following are ONLY available in openadapt-ml (they wrap openadapt-ml internals): - - PolicyAgent (wraps openadapt_ml.runtime.policy.AgentPolicy) - - APIBenchmarkAgent (wraps openadapt_ml.models.api_adapter.ApiVLMAdapter) - - The following should be imported from openadapt-evals: - - ApiAgent (standalone, P0 demo persistence fix) - - All adapter classes (WAAAdapter, WAALiveAdapter, etc.) - - Base classes (BenchmarkAdapter, BenchmarkTask, etc.) - - Evaluation utilities (evaluate_agent_on_benchmark, compute_metrics) - -This module provides interfaces and utilities for evaluating GUI agents -on standardized benchmarks like Windows Agent Arena (WAA), OSWorld, -WebArena, and others. 
- -Core classes: - - BenchmarkAdapter: Abstract interface for benchmark integration - - BenchmarkAgent: Abstract interface for agents to be evaluated - - BenchmarkTask, BenchmarkObservation, BenchmarkAction: Data classes - -Agent implementations: - - PolicyAgent: Wraps openadapt-ml AgentPolicy - - APIBenchmarkAgent: Uses hosted VLM APIs (Claude, GPT-5.1) via openadapt-ml adapters - - ScriptedAgent: Follows predefined action sequence - - RandomAgent: Takes random actions (baseline) - -Evaluation: - - evaluate_agent_on_benchmark: Run agent on benchmark tasks - - compute_metrics: Compute aggregate metrics from results - -Example: - ```python - from openadapt_ml.benchmarks import ( - BenchmarkAdapter, - PolicyAgent, - APIBenchmarkAgent, + from openadapt_evals import ( + WAAMockAdapter, + WAALiveAdapter, evaluate_agent_on_benchmark, - compute_metrics, ) - - # Create adapter for specific benchmark - adapter = WAAAdapter(waa_repo_path="/path/to/WAA") - - # Wrap policy as benchmark agent - agent = PolicyAgent(policy) - - # Or use API-backed agent for baselines - agent = APIBenchmarkAgent(provider="anthropic") # Claude - agent = APIBenchmarkAgent(provider="openai") # GPT-5.1 - - # Run evaluation - results = evaluate_agent_on_benchmark(agent, adapter, max_steps=50) - - # Compute metrics - metrics = compute_metrics(results) - print(f"Success rate: {metrics['success_rate']:.1%}") ``` -""" - -import warnings -# Emit deprecation warning on import -warnings.warn( - "openadapt_ml.benchmarks is deprecated. " - "Please use openadapt_evals for standalone benchmark evaluation. " - "See CLAUDE.md for migration guide.", - DeprecationWarning, - stacklevel=2, -) +ML-specific agents (only available in openadapt-ml): + - PolicyAgent: Wraps openadapt_ml.runtime.policy.AgentPolicy + - APIBenchmarkAgent: Uses openadapt_ml.models.api_adapter.ApiVLMAdapter + - UnifiedBaselineAgent: Uses openadapt_ml.baselines adapters +""" -# ruff: noqa: E402 -# Imports after warning call are intentional from openadapt_ml.benchmarks.agent import ( APIBenchmarkAgent, - BenchmarkAgent, PolicyAgent, - RandomAgent, - ScriptedAgent, - SmartMockAgent, UnifiedBaselineAgent, ) -from openadapt_ml.benchmarks.base import ( - BenchmarkAction, - BenchmarkAdapter, - BenchmarkObservation, - BenchmarkResult, - BenchmarkTask, - StaticDatasetAdapter, - UIElement, -) -from openadapt_ml.benchmarks.runner import ( - EvaluationConfig, - compute_domain_metrics, - compute_metrics, - evaluate_agent_on_benchmark, -) -from openadapt_ml.benchmarks.waa import WAAAdapter, WAAConfig, WAAMockAdapter -from openadapt_ml.benchmarks.waa_live import WAALiveAdapter, WAALiveConfig -from openadapt_ml.benchmarks.viewer import generate_benchmark_viewer - - -# Azure orchestration (lazy import to avoid requiring azure-ai-ml) -def _get_azure_classes(): - from openadapt_ml.benchmarks.azure import ( - AzureConfig, - AzureWAAOrchestrator, - estimate_cost, - ) - - return AzureConfig, AzureWAAOrchestrator, estimate_cost - __all__ = [ - # Base classes - "BenchmarkAdapter", - "BenchmarkTask", - "BenchmarkObservation", - "BenchmarkAction", - "BenchmarkResult", - "StaticDatasetAdapter", - "UIElement", - # Agents - "BenchmarkAgent", "PolicyAgent", "APIBenchmarkAgent", "UnifiedBaselineAgent", - "ScriptedAgent", - "RandomAgent", - "SmartMockAgent", - # Evaluation - "EvaluationConfig", - "evaluate_agent_on_benchmark", - "compute_metrics", - "compute_domain_metrics", - # WAA - "WAAAdapter", - "WAAConfig", - "WAAMockAdapter", - "WAALiveAdapter", - "WAALiveConfig", - # Viewer - 
"generate_benchmark_viewer", - # Azure (lazy-loaded) - "AzureConfig", - "AzureWAAOrchestrator", - "estimate_cost", ] - - -# Lazy loading for Azure classes (avoids requiring azure-ai-ml for basic usage) -def __getattr__(name: str): - if name in ("AzureConfig", "AzureWAAOrchestrator", "estimate_cost"): - from openadapt_ml.benchmarks.azure import ( - AzureConfig, - AzureWAAOrchestrator, - estimate_cost, - ) - - return { - "AzureConfig": AzureConfig, - "AzureWAAOrchestrator": AzureWAAOrchestrator, - "estimate_cost": estimate_cost, - }[name] - raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/openadapt_ml/benchmarks/agent.py b/openadapt_ml/benchmarks/agent.py index aea4d9b..c97d63f 100644 --- a/openadapt_ml/benchmarks/agent.py +++ b/openadapt_ml/benchmarks/agent.py @@ -1,8 +1,15 @@ -"""Agent interface for benchmark evaluation. +"""ML-specific agents for benchmark evaluation. -This module provides the BenchmarkAgent interface that agents must implement -to be evaluated on benchmarks, plus adapters to wrap existing openadapt-ml -components. +This module provides agents that wrap openadapt-ml components (VLM adapters, +policies, baselines) for benchmark evaluation. + +For standalone agents without ML dependencies, use openadapt_evals: + from openadapt_evals import ApiAgent, ScriptedAgent, RandomAgent + +ML-specific agents in this module: + - PolicyAgent: Wraps openadapt_ml.runtime.policy.AgentPolicy + - APIBenchmarkAgent: Uses openadapt_ml.models.api_adapter.ApiVLMAdapter + - UnifiedBaselineAgent: Uses openadapt_ml.baselines adapters Example: from openadapt_ml.benchmarks import PolicyAgent @@ -12,7 +19,7 @@ agent = PolicyAgent(policy) results = evaluate_agent_on_benchmark(agent, benchmark_adapter) - # API-backed agents (GPT-5.1, Claude) + # API-backed agents (GPT-5.1, Claude) using openadapt-ml adapters from openadapt_ml.benchmarks import APIBenchmarkAgent agent = APIBenchmarkAgent(provider="anthropic") # Uses Claude @@ -23,11 +30,12 @@ from __future__ import annotations import re -from abc import ABC, abstractmethod from typing import TYPE_CHECKING, Any -from openadapt_ml.benchmarks.base import ( +# Import base classes from openadapt-evals (canonical location) +from openadapt_evals import ( BenchmarkAction, + BenchmarkAgent, BenchmarkObservation, BenchmarkTask, ) @@ -38,42 +46,6 @@ from openadapt_ml.schema import Action -class BenchmarkAgent(ABC): - """Abstract interface for agents evaluated on benchmarks. - - Agents must implement the `act` method to receive observations - and return actions. The agent can maintain internal state across - steps within an episode. - """ - - @abstractmethod - def act( - self, - observation: BenchmarkObservation, - task: BenchmarkTask, - history: list[tuple[BenchmarkObservation, BenchmarkAction]] | None = None, - ) -> BenchmarkAction: - """Given observation and task, return next action. - - Args: - observation: Current observation from the environment. - task: Task being performed. - history: Optional list of previous (observation, action) pairs. - - Returns: - Action to execute. - """ - pass - - def reset(self) -> None: - """Reset agent state between episodes. - - Called before starting a new task. Override to clear any - internal state. - """ - pass - - class PolicyAgent(BenchmarkAgent): """Wraps openadapt-ml AgentPolicy for benchmark evaluation. 
@@ -127,61 +99,37 @@ def _build_sample( task: BenchmarkTask, history: list[tuple[BenchmarkObservation, BenchmarkAction]] | None, ) -> dict: - """Build SFT-style sample from benchmark observation. - - Args: - observation: Current observation. - task: Current task. - history: Action history. - - Returns: - Sample dict with 'images' and 'messages'. - """ - # Build user message content + """Build SFT-style sample from benchmark observation.""" content_parts = [f"Goal: {task.instruction}"] - # Add accessibility tree if available and enabled if self.use_accessibility_tree and observation.accessibility_tree: tree_str = self._format_accessibility_tree(observation.accessibility_tree) content_parts.append(f"UI Elements:\n{tree_str}") - # Add context if observation.url: content_parts.append(f"URL: {observation.url}") if observation.window_title: content_parts.append(f"Window: {observation.window_title}") - # Add history if enabled if self.use_history and history: history_str = self._format_history(history) content_parts.append(f"Previous actions:\n{history_str}") content_parts.append("What action should be taken next?") - # Build sample sample = { "messages": [ {"role": "user", "content": "\n\n".join(content_parts)}, ], } - # Add image if available if observation.screenshot_path: sample["images"] = [observation.screenshot_path] return sample def _format_accessibility_tree(self, tree: dict, indent: int = 0) -> str: - """Format accessibility tree for prompt. - - Args: - tree: Accessibility tree dict. - indent: Current indentation level. - - Returns: - Formatted string representation. - """ - # Simple formatting - can be overridden for platform-specific formatting + """Format accessibility tree for prompt.""" lines = [] prefix = " " * indent @@ -202,29 +150,15 @@ def _format_accessibility_tree(self, tree: dict, indent: int = 0) -> str: def _format_history( self, history: list[tuple[BenchmarkObservation, BenchmarkAction]] ) -> str: - """Format action history for prompt. - - Args: - history: List of (observation, action) pairs. - - Returns: - Formatted string. - """ + """Format action history for prompt.""" lines = [] - for i, (obs, action) in enumerate(history[-5:], 1): # Last 5 actions + for i, (obs, action) in enumerate(history[-5:], 1): action_str = self._action_to_string(action) lines.append(f"{i}. {action_str}") return "\n".join(lines) def _action_to_string(self, action: BenchmarkAction) -> str: - """Convert BenchmarkAction to string representation. - - Args: - action: Action to convert. - - Returns: - String representation. - """ + """Convert BenchmarkAction to string representation.""" if action.type == "click": if action.target_name: return f"CLICK({action.target_name})" @@ -249,31 +183,19 @@ def _action_to_string(self, action: BenchmarkAction) -> str: def _to_benchmark_action( self, action: Action, thought: str | None ) -> BenchmarkAction: - """Convert openadapt-ml Action to BenchmarkAction. - - Args: - action: Action from policy. - thought: Optional thought/reasoning. - - Returns: - BenchmarkAction. 
- """ - # Extract normalized coordinates + """Convert openadapt-ml Action to BenchmarkAction.""" x, y = None, None if action.normalized_coordinates is not None: x, y = action.normalized_coordinates - # Extract end coordinates for drag end_x, end_y = None, None if action.normalized_end is not None: end_x, end_y = action.normalized_end - # Extract action type value (enum -> string) action_type = ( action.type.value if hasattr(action.type, "value") else action.type ) - # Extract element info if available target_node_id = None target_role = None target_name = None @@ -311,192 +233,28 @@ def _to_benchmark_action( def reset(self) -> None: """Reset agent state.""" - # PolicyAgent is stateless, nothing to reset - pass - - -class ScriptedAgent(BenchmarkAgent): - """Agent that follows a predefined script of actions. - - Useful for testing benchmark adapters or replaying trajectories. - - Args: - actions: List of actions to execute in order. - """ - - def __init__(self, actions: list[BenchmarkAction]): - self.actions = actions - self._step = 0 - - def act( - self, - observation: BenchmarkObservation, - task: BenchmarkTask, - history: list[tuple[BenchmarkObservation, BenchmarkAction]] | None = None, - ) -> BenchmarkAction: - """Return the next scripted action. - - Args: - observation: Ignored. - task: Ignored. - history: Ignored. - - Returns: - Next action from script, or DONE if script exhausted. - """ - if self._step < len(self.actions): - action = self.actions[self._step] - self._step += 1 - return action - return BenchmarkAction(type="done") - - def reset(self) -> None: - """Reset step counter.""" - self._step = 0 - - -class RandomAgent(BenchmarkAgent): - """Agent that takes random actions. - - Useful for baseline comparisons. - - Args: - action_types: List of action types to randomly select from. - seed: Random seed for reproducibility. - """ - - def __init__( - self, - action_types: list[str] | None = None, - seed: int | None = None, - ): - import random - - self.action_types = action_types or ["click", "type", "scroll", "done"] - self.rng = random.Random(seed) - - def act( - self, - observation: BenchmarkObservation, - task: BenchmarkTask, - history: list[tuple[BenchmarkObservation, BenchmarkAction]] | None = None, - ) -> BenchmarkAction: - """Return a random action. - - Args: - observation: Used to get viewport bounds. - task: Ignored. - history: Used to decide when to stop. - - Returns: - Random action. - """ - # Stop after many actions - if history and len(history) > 20: - return BenchmarkAction(type="done") - - action_type = self.rng.choice(self.action_types) - - if action_type == "click": - return BenchmarkAction( - type="click", - x=self.rng.random(), - y=self.rng.random(), - ) - elif action_type == "type": - return BenchmarkAction( - type="type", - text="test", - ) - elif action_type == "scroll": - return BenchmarkAction( - type="scroll", - scroll_direction=self.rng.choice(["up", "down"]), - ) - else: - return BenchmarkAction(type="done") - - def reset(self) -> None: - """Nothing to reset.""" pass -class SmartMockAgent(BenchmarkAgent): - """Agent designed to pass WAAMockAdapter evaluation. - - Performs a fixed sequence of actions that satisfy the mock adapter's - success criteria. Use for validating the benchmark pipeline locally. 
- - The mock adapter evaluates success based on: - - Clicking Submit (ID 4) - primary success path - - Typing something AND clicking OK (ID 1) - form submission path - - Calling DONE after at least 2 actions - reasonable completion - - This agent clicks Submit (ID 4) which is the simplest success path. - """ - - def __init__(self): - """Initialize the agent.""" - self._step = 0 - # Simple action sequence: click Submit button (ID 4), then done - self._actions = [ - BenchmarkAction(type="click", target_node_id="4"), # Click Submit - BenchmarkAction(type="done"), - ] - - def act( - self, - observation: BenchmarkObservation, - task: BenchmarkTask, - history: list[tuple[BenchmarkObservation, BenchmarkAction]] | None = None, - ) -> BenchmarkAction: - """Return the next scripted action. - - Args: - observation: Ignored. - task: Ignored. - history: Ignored. - - Returns: - Next action from script, or DONE if script exhausted. - """ - if self._step < len(self._actions): - action = self._actions[self._step] - self._step += 1 - return action - return BenchmarkAction(type="done") - - def reset(self) -> None: - """Reset step counter.""" - self._step = 0 - - class APIBenchmarkAgent(BenchmarkAgent): - """Agent that uses hosted VLM APIs (Claude, GPT-5.1) for benchmark evaluation. + """Agent that uses hosted VLM APIs via openadapt-ml ApiVLMAdapter. This agent wraps ApiVLMAdapter to provide Claude or GPT-5.1 baselines for benchmark evaluation. It converts BenchmarkObservation to the API format and parses VLM responses into BenchmarkActions. + Note: For standalone API evaluation without openadapt-ml, use + openadapt_evals.ApiAgent instead (has P0 demo persistence fix). + Args: provider: API provider - "anthropic" (Claude) or "openai" (GPT-5.1). api_key: Optional API key override. If not provided, uses env vars. - model: Optional model name override. Defaults to provider's best VLM. + model: Optional model name override. max_tokens: Maximum tokens for VLM response. use_accessibility_tree: Whether to include accessibility tree in prompt. use_history: Whether to include action history in prompt. - - Example: - # Claude baseline - agent = APIBenchmarkAgent(provider="anthropic") - results = evaluate_agent_on_benchmark(agent, waa_adapter) - - # GPT-5.1 baseline - agent = APIBenchmarkAgent(provider="openai") - results = evaluate_agent_on_benchmark(agent, waa_adapter) """ - # System prompt for GUI automation SYSTEM_PROMPT = """You are a GUI automation agent. Given a screenshot and task instruction, determine the next action to take. Available actions: @@ -506,7 +264,7 @@ class APIBenchmarkAgent(BenchmarkAgent): - KEY(key) - Press a key (e.g., Enter, Tab, Escape) - KEY(modifier+key) - Press key combination (e.g., Ctrl+c, Alt+Tab) - SCROLL(direction) - Scroll up or down -- DRAG(x1, y1, x2, y2) - Drag from (x1,y1) to (x2,y2) (pixel or normalized) +- DRAG(x1, y1, x2, y2) - Drag from (x1,y1) to (x2,y2) - DONE() - Task is complete - ANSWER("response") - For QA tasks, provide the answer @@ -555,32 +313,15 @@ def act( task: BenchmarkTask, history: list[tuple[BenchmarkObservation, BenchmarkAction]] | None = None, ) -> BenchmarkAction: - """Use VLM API to determine next action. - - Args: - observation: Current observation with screenshot. - task: Task being performed. - history: Previous observations and actions. - - Returns: - BenchmarkAction parsed from VLM response. 
- """ + """Use VLM API to determine next action.""" adapter = self._get_adapter() - - # Build the sample for the API sample = self._build_sample(observation, task, history) - # Call the VLM API try: response = adapter.generate(sample, max_new_tokens=self.max_tokens) except Exception as e: - # On API error, return done to avoid infinite loops - return BenchmarkAction( - type="done", - raw_action={"error": str(e)}, - ) + return BenchmarkAction(type="done", raw_action={"error": str(e)}) - # Parse the response into a BenchmarkAction return self._parse_response(response, observation) def _build_sample( @@ -589,41 +330,26 @@ def _build_sample( task: BenchmarkTask, history: list[tuple[BenchmarkObservation, BenchmarkAction]] | None, ) -> dict[str, Any]: - """Build API sample from benchmark observation. - - Args: - observation: Current observation. - task: Current task. - history: Action history. - - Returns: - Sample dict with 'images' and 'messages'. - """ - # Build user message content + """Build API sample from benchmark observation.""" content_parts = [f"GOAL: {task.instruction}"] - # Add context if observation.url: content_parts.append(f"URL: {observation.url}") if observation.window_title: content_parts.append(f"Window: {observation.window_title}") - # Add accessibility tree if available and enabled if self.use_accessibility_tree and observation.accessibility_tree: tree_str = self._format_accessibility_tree(observation.accessibility_tree) - # Truncate if too long if len(tree_str) > 4000: tree_str = tree_str[:4000] + "\n... (truncated)" content_parts.append(f"UI Elements:\n{tree_str}") - # Add history if enabled if self.use_history and history: history_str = self._format_history(history) content_parts.append(f"Previous actions:\n{history_str}") content_parts.append("\nWhat is the next action?") - # Build sample sample: dict[str, Any] = { "messages": [ {"role": "system", "content": self.SYSTEM_PROMPT}, @@ -631,22 +357,13 @@ def _build_sample( ], } - # Add image if available if observation.screenshot_path: sample["images"] = [observation.screenshot_path] return sample def _format_accessibility_tree(self, tree: dict, indent: int = 0) -> str: - """Format accessibility tree for prompt. - - Args: - tree: Accessibility tree dict. - indent: Current indentation level. - - Returns: - Formatted string representation. - """ + """Format accessibility tree for prompt.""" lines = [] prefix = " " * indent @@ -667,29 +384,15 @@ def _format_accessibility_tree(self, tree: dict, indent: int = 0) -> str: def _format_history( self, history: list[tuple[BenchmarkObservation, BenchmarkAction]] ) -> str: - """Format action history for prompt. - - Args: - history: List of (observation, action) pairs. - - Returns: - Formatted string. - """ + """Format action history for prompt.""" lines = [] - for i, (obs, action) in enumerate(history[-5:], 1): # Last 5 actions + for i, (obs, action) in enumerate(history[-5:], 1): action_str = self._action_to_string(action) lines.append(f"{i}. {action_str}") return "\n".join(lines) def _action_to_string(self, action: BenchmarkAction) -> str: - """Convert BenchmarkAction to string representation. - - Args: - action: Action to convert. - - Returns: - String representation. 
- """ + """Convert BenchmarkAction to string representation.""" if action.type == "click": if action.target_node_id: return f"CLICK([{action.target_node_id}])" @@ -718,32 +421,14 @@ def _action_to_string(self, action: BenchmarkAction) -> str: def _parse_response( self, response: str, observation: BenchmarkObservation | None = None ) -> BenchmarkAction: - """Parse VLM response into BenchmarkAction. - - Handles various response formats: - - ACTION: CLICK(0.5, 0.3) - - CLICK(0.5, 0.3) - - I'll click at coordinates (0.5, 0.3) -> CLICK(0.5, 0.3) - - Args: - response: Raw VLM response text. - observation: Current observation (used for coordinate normalization). - - Returns: - Parsed BenchmarkAction. - """ - # Store raw response for debugging + """Parse VLM response into BenchmarkAction.""" raw_action = {"response": response} - # Extract action line (look for ACTION: prefix or action pattern) action_line = None - - # Try to find ACTION: prefix action_match = re.search(r"ACTION:\s*(.+)", response, re.IGNORECASE) if action_match: action_line = action_match.group(1).strip() else: - # Look for action pattern anywhere in response patterns = [ r"(CLICK\s*\([^)]+\))", r"(TYPE\s*\([^)]+\))", @@ -760,107 +445,76 @@ def _parse_response( break if not action_line: - # Could not parse action, return done raw_action["parse_error"] = "No action pattern found" return BenchmarkAction(type="done", raw_action=raw_action) - # Parse CLICK action + # Parse CLICK([id]) click_match = re.match( r"CLICK\s*\(\s*\[?(\d+)\]?\s*\)", action_line, re.IGNORECASE ) if click_match: - # CLICK([id]) - element ID node_id = click_match.group(1) return BenchmarkAction( - type="click", - target_node_id=node_id, - raw_action=raw_action, + type="click", target_node_id=node_id, raw_action=raw_action ) + # Parse CLICK(x, y) click_coords = re.match( r"CLICK\s*\(\s*([\d.]+)\s*,\s*([\d.]+)\s*\)", action_line, re.IGNORECASE ) if click_coords: - # CLICK(x, y) - coordinates x = float(click_coords.group(1)) y = float(click_coords.group(2)) - - # Normalize coordinates if they appear to be pixel values - # If x or y > 1.0, assume pixel coordinates and normalize using viewport if observation and observation.viewport and (x > 1.0 or y > 1.0): width, height = observation.viewport - x_norm = x / width - y_norm = y / height raw_action["original_coords"] = {"x": x, "y": y} raw_action["normalized"] = True - x = x_norm - y = y_norm - - return BenchmarkAction( - type="click", - x=x, - y=y, - raw_action=raw_action, - ) + x, y = x / width, y / height + return BenchmarkAction(type="click", x=x, y=y, raw_action=raw_action) - # Parse TYPE action + # Parse TYPE type_match = re.match( r"TYPE\s*\(\s*[\"'](.+?)[\"']\s*\)", action_line, re.IGNORECASE ) if type_match: - text = type_match.group(1) return BenchmarkAction( - type="type", - text=text, - raw_action=raw_action, + type="type", text=type_match.group(1), raw_action=raw_action ) - # Parse KEY action + # Parse KEY key_match = re.match(r"KEY\s*\(\s*(.+?)\s*\)", action_line, re.IGNORECASE) if key_match: key_str = key_match.group(1) - # Handle modifier+key format if "+" in key_str: parts = key_str.split("+") - key = parts[-1] - modifiers = parts[:-1] return BenchmarkAction( type="key", - key=key, - modifiers=modifiers, + key=parts[-1], + modifiers=parts[:-1], raw_action=raw_action, ) - return BenchmarkAction( - type="key", - key=key_str, - raw_action=raw_action, - ) + return BenchmarkAction(type="key", key=key_str, raw_action=raw_action) - # Parse SCROLL action + # Parse SCROLL scroll_match = re.match( 
r"SCROLL\s*\(\s*(up|down)\s*\)", action_line, re.IGNORECASE ) if scroll_match: - direction = scroll_match.group(1).lower() return BenchmarkAction( type="scroll", - scroll_direction=direction, + scroll_direction=scroll_match.group(1).lower(), raw_action=raw_action, ) - # Parse DRAG action + # Parse DRAG drag_match = re.match( r"DRAG\s*\(\s*([\d.]+)\s*,\s*([\d.]+)\s*,\s*([\d.]+)\s*,\s*([\d.]+)\s*\)", action_line, re.IGNORECASE, ) if drag_match: - x = float(drag_match.group(1)) - y = float(drag_match.group(2)) - end_x = float(drag_match.group(3)) - end_y = float(drag_match.group(4)) - - # Normalize coordinates if they appear to be pixel values + x, y = float(drag_match.group(1)), float(drag_match.group(2)) + end_x, end_y = float(drag_match.group(3)), float(drag_match.group(4)) if ( observation and observation.viewport @@ -874,88 +528,51 @@ def _parse_response( "end_y": end_y, } raw_action["normalized"] = True - x = x / width - y = y / height - end_x = end_x / width - end_y = end_y / height - + x, y, end_x, end_y = ( + x / width, + y / height, + end_x / width, + end_y / height, + ) return BenchmarkAction( - type="drag", - x=x, - y=y, - end_x=end_x, - end_y=end_y, - raw_action=raw_action, + type="drag", x=x, y=y, end_x=end_x, end_y=end_y, raw_action=raw_action ) - # Parse DONE action + # Parse DONE if re.match(r"DONE\s*\(\s*\)", action_line, re.IGNORECASE): return BenchmarkAction(type="done", raw_action=raw_action) - # Parse ANSWER action + # Parse ANSWER answer_match = re.match( r"ANSWER\s*\(\s*[\"'](.+?)[\"']\s*\)", action_line, re.IGNORECASE ) if answer_match: - answer = answer_match.group(1) return BenchmarkAction( - type="answer", - answer=answer, - raw_action=raw_action, + type="answer", answer=answer_match.group(1), raw_action=raw_action ) - # Unknown action format raw_action["parse_error"] = f"Unknown action format: {action_line}" return BenchmarkAction(type="done", raw_action=raw_action) def reset(self) -> None: """Reset agent state.""" - # APIBenchmarkAgent is stateless, nothing to reset pass class UnifiedBaselineAgent(BenchmarkAgent): - """Agent that uses the UnifiedBaselineAdapter for benchmark evaluation. - - This agent provides a unified interface for comparing Claude, GPT, and Gemini - models across multiple evaluation tracks (coordinates, ReAct, SoM). + """Agent that uses UnifiedBaselineAdapter for benchmark evaluation. - Compared to APIBenchmarkAgent, this agent: - - Uses the new provider abstraction (models/providers/) - - Supports multiple tracks (A, B, C) with track-specific prompts - - Uses the unified response parser - - Supports model aliases for easy switching + Provides unified interface for Claude, GPT, and Gemini baselines + across multiple tracks (A: coordinates, B: ReAct, C: SoM). Args: - model_alias: Model alias (e.g., 'claude-opus-4.5', 'gpt-5.2', 'gemini-3-pro'). + model_alias: Model alias (e.g., 'claude-opus-4.5', 'gpt-5.2'). track: Track type ('A', 'B', or 'C'). Defaults to 'A'. - api_key: Optional API key override. If not provided, uses env vars. - temperature: Sampling temperature. Defaults to 0.1. - max_tokens: Maximum tokens for response. Defaults to 1024. - demo: Optional demo text to include in prompts. - verbose: Whether to print verbose debug output. 
- - Example: - # Claude baseline with Track C (Set-of-Mark) - agent = UnifiedBaselineAgent( - model_alias="claude-opus-4.5", - track="C", - ) - results = evaluate_agent_on_benchmark(agent, waa_adapter) - - # GPT baseline with Track A (direct coordinates) - agent = UnifiedBaselineAgent( - model_alias="gpt-5.2", - track="A", - ) - results = evaluate_agent_on_benchmark(agent, waa_adapter) - - # Gemini baseline with Track B (ReAct reasoning) - agent = UnifiedBaselineAgent( - model_alias="gemini-3-pro", - track="B", - ) - results = evaluate_agent_on_benchmark(agent, waa_adapter) + api_key: Optional API key override. + temperature: Sampling temperature. + max_tokens: Maximum tokens for response. + demo: Optional demo text for prompts. + verbose: Whether to print debug output. """ def __init__( @@ -980,12 +597,8 @@ def __init__( def _get_adapter(self): """Lazily initialize the UnifiedBaselineAdapter.""" if self._adapter is None: - from openadapt_ml.baselines import ( - TrackConfig, - UnifiedBaselineAdapter, - ) + from openadapt_ml.baselines import TrackConfig, UnifiedBaselineAdapter - # Select track config track_configs = { "A": TrackConfig.track_a(), "B": TrackConfig.track_b(), @@ -993,7 +606,6 @@ def _get_adapter(self): } track_config = track_configs.get(self.track, TrackConfig.track_a()) - # Create adapter from alias self._adapter = UnifiedBaselineAdapter.from_alias( self.model_alias, track=track_config, @@ -1011,21 +623,11 @@ def act( task: BenchmarkTask, history: list[tuple[BenchmarkObservation, BenchmarkAction]] | None = None, ) -> BenchmarkAction: - """Use UnifiedBaselineAdapter to determine next action. - - Args: - observation: Current observation with screenshot. - task: Task being performed. - history: Previous observations and actions. - - Returns: - BenchmarkAction parsed from adapter response. 
- """ + """Use UnifiedBaselineAdapter to determine next action.""" from PIL import Image adapter = self._get_adapter() - # Load screenshot if available screenshot = None if observation.screenshot_path: try: @@ -1034,19 +636,16 @@ def act( if self.verbose: print(f"[UnifiedBaselineAgent] Failed to load screenshot: {e}") - # Build accessibility tree string - a11y_tree = None - if observation.accessibility_tree: - a11y_tree = observation.accessibility_tree + a11y_tree = ( + observation.accessibility_tree if observation.accessibility_tree else None + ) - # Build history for adapter adapter_history = None if history: - adapter_history = [] - for obs, action in history[-5:]: # Last 5 actions - adapter_history.append(self._benchmark_action_to_dict(action)) + adapter_history = [ + self._benchmark_action_to_dict(a) for _, a in history[-5:] + ] - # Call adapter try: parsed_action = adapter.predict( screenshot=screenshot, @@ -1057,18 +656,13 @@ def act( except Exception as e: if self.verbose: print(f"[UnifiedBaselineAgent] Adapter error: {e}") - return BenchmarkAction( - type="done", - raw_action={"error": str(e)}, - ) + return BenchmarkAction(type="done", raw_action={"error": str(e)}) - # Convert ParsedAction to BenchmarkAction return self._parsed_to_benchmark_action(parsed_action, observation) def _benchmark_action_to_dict(self, action: BenchmarkAction) -> dict[str, Any]: """Convert BenchmarkAction to dict for history.""" result = {"type": action.type} - if action.x is not None: result["x"] = action.x if action.y is not None: @@ -1081,23 +675,12 @@ def _benchmark_action_to_dict(self, action: BenchmarkAction) -> dict[str, Any]: result["element_id"] = action.target_node_id if action.scroll_direction: result["direction"] = action.scroll_direction - return result def _parsed_to_benchmark_action( - self, - parsed_action, - observation: BenchmarkObservation | None = None, + self, parsed_action, observation: BenchmarkObservation | None = None ) -> BenchmarkAction: - """Convert ParsedAction to BenchmarkAction. - - Args: - parsed_action: ParsedAction from adapter. - observation: Current observation (for coordinate normalization). - - Returns: - BenchmarkAction. 
- """ + """Convert ParsedAction to BenchmarkAction.""" raw_action = { "raw_response": parsed_action.raw_response, "thought": parsed_action.thought, @@ -1117,36 +700,21 @@ def _parsed_to_benchmark_action( raw_action=raw_action, ) elif parsed_action.x is not None and parsed_action.y is not None: - x = parsed_action.x - y = parsed_action.y - - # Normalize coordinates if they appear to be pixel values + x, y = parsed_action.x, parsed_action.y if observation and observation.viewport and (x > 1.0 or y > 1.0): width, height = observation.viewport raw_action["original_coords"] = {"x": x, "y": y} - raw_action["normalized"] = True - x = x / width - y = y / height - - return BenchmarkAction( - type="click", - x=x, - y=y, - raw_action=raw_action, - ) + x, y = x / width, y / height + return BenchmarkAction(type="click", x=x, y=y, raw_action=raw_action) elif action_type == "type": return BenchmarkAction( - type="type", - text=parsed_action.text, - raw_action=raw_action, + type="type", text=parsed_action.text, raw_action=raw_action ) elif action_type == "key": return BenchmarkAction( - type="key", - key=parsed_action.key, - raw_action=raw_action, + type="key", key=parsed_action.key, raw_action=raw_action ) elif action_type == "scroll": @@ -1160,27 +728,20 @@ def _parsed_to_benchmark_action( return BenchmarkAction(type="done", raw_action=raw_action) elif action_type == "drag": - x = parsed_action.x - y = parsed_action.y - end_x = getattr(parsed_action, "end_x", None) - end_y = getattr(parsed_action, "end_y", None) - return BenchmarkAction( type="drag", - x=x, - y=y, - end_x=end_x, - end_y=end_y, + x=parsed_action.x, + y=parsed_action.y, + end_x=getattr(parsed_action, "end_x", None), + end_y=getattr(parsed_action, "end_y", None), raw_action=raw_action, ) - # Unknown action type, return done raw_action["unknown_action"] = action_type return BenchmarkAction(type="done", raw_action=raw_action) def reset(self) -> None: """Reset agent state.""" - # UnifiedBaselineAgent is stateless, nothing to reset pass def __repr__(self) -> str: diff --git a/openadapt_ml/benchmarks/azure.py b/openadapt_ml/benchmarks/azure.py index 157c62b..f18f59c 100644 --- a/openadapt_ml/benchmarks/azure.py +++ b/openadapt_ml/benchmarks/azure.py @@ -36,8 +36,7 @@ from pathlib import Path from typing import Callable -from openadapt_ml.benchmarks.agent import BenchmarkAgent -from openadapt_ml.benchmarks.base import BenchmarkResult, BenchmarkTask +from openadapt_evals import BenchmarkAgent, BenchmarkResult, BenchmarkTask logger = logging.getLogger(__name__) @@ -496,7 +495,7 @@ def run_evaluation( List of BenchmarkResult for all tasks. """ # Load tasks - from openadapt_ml.benchmarks.waa import WAAAdapter + from openadapt_evals import WAAMockAdapter as WAAAdapter adapter = WAAAdapter(waa_repo_path=self.waa_repo_path) if task_ids: diff --git a/openadapt_ml/benchmarks/base.py b/openadapt_ml/benchmarks/base.py deleted file mode 100644 index 522914e..0000000 --- a/openadapt_ml/benchmarks/base.py +++ /dev/null @@ -1,368 +0,0 @@ -"""Base classes for benchmark integration. - -This module provides the core abstractions for integrating GUI agent benchmarks -into openadapt-ml. It supports both interactive environments (WAA, OSWorld) and -static trajectory datasets (Mind2Web). 
- -Example: - from openadapt_ml.benchmarks import WAAAdapter, evaluate_agent_on_benchmark - - adapter = WAAAdapter(waa_repo_path="/path/to/WAA") - results = evaluate_agent_on_benchmark(agent, adapter, max_steps=50) -""" - -from __future__ import annotations - -from abc import ABC, abstractmethod -from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Any, Iterator - -if TYPE_CHECKING: - pass - - -@dataclass -class BenchmarkTask: - """Canonical task representation. - - Attributes: - task_id: Unique identifier for the task. - instruction: Natural language task instruction. - domain: Task domain ("web", "desktop", "mobile"). - initial_state_ref: Reference to initial state (VM snapshot, URL, etc.). - time_limit_steps: Maximum steps allowed for the task. - raw_config: Original benchmark config (lossless preservation). - evaluation_spec: Benchmark-native evaluation specification. - """ - - task_id: str - instruction: str - domain: str # "web", "desktop", "mobile" - - # Environment setup - initial_state_ref: str | None = None # VM snapshot, storage_state, start URL - time_limit_steps: int | None = None - - # Preserve original config losslessly - raw_config: dict[str, Any] = field(default_factory=dict) - - # Evaluation spec (benchmark-native) - evaluation_spec: dict[str, Any] | None = None - - -@dataclass -class BenchmarkObservation: - """Canonical observation at each step. - - Supports multiple observation modalities: - - Visual: screenshots with viewport info - - Structured UI: accessibility tree (UIA/AXTree/DOM) - - Context: URL, window title, focused element - - Attributes: - screenshot: PNG image bytes. - screenshot_path: Path to saved screenshot. - viewport: (width, height) of the viewport. - accessibility_tree: Platform-specific UI tree (UIA/AXTree/DOM). - dom_html: Raw HTML for web tasks. - url: Current URL for web tasks. - window_title: Active window title for desktop tasks. - focused_element: Currently focused UI element. - raw_observation: Original benchmark observation (lossless). - """ - - # Visual - screenshot: bytes | None = None # PNG image bytes - screenshot_path: str | None = None - viewport: tuple[int, int] | None = None # (width, height) - - # Structured UI (format varies by platform) - accessibility_tree: dict | None = None # UIA (Windows), AXTree (macOS), DOM (web) - dom_html: str | None = None # Raw HTML for web - - # Context - url: str | None = None # For web tasks - window_title: str | None = None # For desktop tasks - app_name: str | None = None # Active application - focused_element: dict | None = None # {node_id, bbox, text} - - # Raw benchmark-specific data (lossless) - raw_observation: dict[str, Any] | None = None - - -@dataclass -class BenchmarkAction: - """Canonical action representation. - - Supports multiple action types with both coordinate-based and element-based - grounding. The "grounding-first" approach stores both when available. - - Attributes: - type: Action type ("click", "type", "scroll", "key", "drag", "answer", "done"). - x: X coordinate (normalized [0,1] or pixels). - y: Y coordinate (normalized [0,1] or pixels). - target_node_id: Element ID from accessibility tree. - target_bbox: Element bounding box. - target_role: Element role (button, textfield, etc.). - target_name: Element accessible name. - text: Text to type (for "type" action). - key: Single key (for "key" action, e.g., "Enter", "Tab"). - modifiers: Key modifiers (["ctrl", "shift", "alt"]). - scroll_direction: Scroll direction ("up", "down", "left", "right"). 
- scroll_amount: Scroll amount (pixels or normalized). - end_x: Drag end X coordinate. - end_y: Drag end Y coordinate. - answer: Answer string (for benchmarks that score by answer). - raw_action: Original benchmark action (lossless). - """ - - type: str # "click", "type", "scroll", "key", "drag", "answer", "done" - - # Pointer actions - coordinates - x: float | None = None # Normalized [0,1] or pixel - y: float | None = None - - # Element grounding (when available) - target_node_id: str | None = None # DOM/AX/UIA node ID - target_bbox: tuple[float, float, float, float] | None = None - target_role: str | None = None # "button", "textfield", etc. - target_name: str | None = None # Accessible name - - # Keyboard actions - text: str | None = None # For "type" action - text to type - key: str | None = None # For "key" action - single key - modifiers: list[str] | None = None # ["ctrl", "shift", "alt"] - - # Scroll actions - scroll_direction: str | None = None # "up", "down", "left", "right" - scroll_amount: float | None = None # Pixels or normalized - - # Drag actions - end_x: float | None = None - end_y: float | None = None - - # Answer action (some benchmarks score by final answer) - answer: str | None = None - - # Raw benchmark-specific format (lossless) - raw_action: dict[str, Any] | None = None - - -@dataclass -class BenchmarkResult: - """Result of a single task evaluation. - - Attributes: - task_id: ID of the evaluated task. - success: Whether the task was completed successfully. - score: Score between 0.0 and 1.0. - steps: List of (observation, action) pairs from the trajectory. - num_steps: Number of steps taken. - error: Error message if task failed due to error. - reason: Explanation of success/failure. - total_time_seconds: Total time taken for the task. - """ - - task_id: str - success: bool - score: float # 0.0 to 1.0 - - # Trajectory - steps: list[tuple[BenchmarkObservation, BenchmarkAction]] = field( - default_factory=list - ) - num_steps: int = 0 - - # Diagnostics - error: str | None = None - reason: str | None = None # Why success/fail - - # Timing - total_time_seconds: float = 0.0 - - -@dataclass -class UIElement: - """Normalized UI element for cross-platform use. - - Provides a common representation for UI elements across platforms - (Windows UIA, macOS AXTree, web DOM). - - Attributes: - node_id: Unique identifier for the element. - role: Element role (button, textfield, link, etc.). - name: Accessible name/label. - bbox: Bounding box (normalized [0,1] or pixels). - text: Text content. - value: Current value (for inputs). - children: Child elements. - attributes: Additional platform-specific attributes. - """ - - node_id: str - role: str # "button", "textfield", "link", etc. - name: str | None = None # Accessible name/label - bbox: tuple[float, float, float, float] | None = None # (x1, y1, x2, y2) - text: str | None = None # Text content - value: str | None = None # Current value (for inputs) - children: list[UIElement] | None = None - attributes: dict[str, Any] | None = None # Platform-specific - - -class BenchmarkAdapter(ABC): - """Abstract interface for benchmark integration. - - Subclasses implement this interface to integrate specific benchmarks - (WAA, OSWorld, WebArena, etc.) with openadapt-ml. 
- - Two types of adapters: - - Interactive: Run environment, step through tasks (WAA, OSWorld) - - Static: Load trajectories for offline training/eval (Mind2Web) - """ - - @property - @abstractmethod - def name(self) -> str: - """Benchmark name (e.g., 'waa', 'osworld', 'webarena').""" - pass - - @property - @abstractmethod - def benchmark_type(self) -> str: - """Benchmark type: 'interactive' or 'static'.""" - pass - - @property - def supports_parallel(self) -> bool: - """Whether the adapter supports parallel task execution.""" - return False - - @abstractmethod - def list_tasks(self, domain: str | None = None) -> list[BenchmarkTask]: - """List available tasks, optionally filtered by domain. - - Args: - domain: Optional domain filter (e.g., "browser", "office"). - - Returns: - List of BenchmarkTask objects. - """ - pass - - @abstractmethod - def load_task(self, task_id: str) -> BenchmarkTask: - """Load a specific task by ID. - - Args: - task_id: Task identifier. - - Returns: - BenchmarkTask object. - - Raises: - KeyError: If task_id not found. - """ - pass - - @abstractmethod - def reset(self, task: BenchmarkTask) -> BenchmarkObservation: - """Reset environment to task's initial state. - - Args: - task: Task to initialize. - - Returns: - Initial observation. - """ - pass - - @abstractmethod - def step( - self, action: BenchmarkAction - ) -> tuple[BenchmarkObservation, bool, dict[str, Any]]: - """Execute action and return new observation. - - Args: - action: Action to execute. - - Returns: - Tuple of (observation, done, info). - """ - pass - - @abstractmethod - def evaluate(self, task: BenchmarkTask) -> BenchmarkResult: - """Run benchmark's native evaluation on current state. - - Args: - task: Task to evaluate. - - Returns: - BenchmarkResult with success/score. - """ - pass - - def close(self) -> None: - """Clean up resources (VMs, browser, etc.).""" - pass - - def __enter__(self) -> BenchmarkAdapter: - """Context manager entry.""" - return self - - def __exit__(self, exc_type, exc_val, exc_tb) -> None: - """Context manager exit.""" - self.close() - - -class StaticDatasetAdapter(BenchmarkAdapter): - """Base for static trajectory datasets (Mind2Web, demos). - - Static adapters load pre-recorded trajectories for offline training - or evaluation, rather than running an interactive environment. - """ - - @property - def benchmark_type(self) -> str: - """Static datasets are not interactive.""" - return "static" - - @abstractmethod - def load_trajectories( - self, split: str = "test" - ) -> Iterator[ - tuple[BenchmarkTask, list[tuple[BenchmarkObservation, BenchmarkAction]]] - ]: - """Iterate over expert trajectories. - - Args: - split: Dataset split ("train", "val", "test"). - - Yields: - Tuples of (task, trajectory) where trajectory is a list of - (observation, action) pairs. - """ - pass - - def reset(self, task: BenchmarkTask) -> BenchmarkObservation: - """Not supported for static datasets.""" - raise NotImplementedError( - "Static datasets don't support interactive reset. " - "Use load_trajectories() instead." - ) - - def step( - self, action: BenchmarkAction - ) -> tuple[BenchmarkObservation, bool, dict[str, Any]]: - """Not supported for static datasets.""" - raise NotImplementedError( - "Static datasets don't support interactive stepping. " - "Use load_trajectories() instead." - ) - - def evaluate(self, task: BenchmarkTask) -> BenchmarkResult: - """Not supported for static datasets.""" - raise NotImplementedError( - "Static datasets don't support execution-based evaluation. 
" - "Use offline metrics instead." - ) diff --git a/openadapt_ml/benchmarks/cli.py b/openadapt_ml/benchmarks/cli.py index d16f231..90cb1ae 100644 --- a/openadapt_ml/benchmarks/cli.py +++ b/openadapt_ml/benchmarks/cli.py @@ -35,6 +35,7 @@ import subprocess import sys import time +import webbrowser from datetime import datetime from pathlib import Path from typing import Optional @@ -43,7 +44,24 @@ # Constants (single source of truth) # ============================================================================= -VM_SIZE = "Standard_D4ds_v4" +# VM sizes with nested virtualization support +# Standard: $0.19/hr, 4 vCPU, 16GB RAM - baseline +# Fast: $0.38/hr, 8 vCPU, 32GB RAM - ~30% faster install, ~40% faster eval +VM_SIZE_STANDARD = "Standard_D4ds_v4" +VM_SIZE_FAST = "Standard_D8ds_v5" +VM_SIZE = VM_SIZE_STANDARD # Default, can be overridden by --fast flag + +# Fallback sizes for --fast mode (in order of preference) +# D8ds_v5: First choice (v5 with local SSD) +# D8s_v5: v5 without local SSD +# D8ds_v4: v4 with local SSD +# D8as_v5: AMD version +VM_SIZE_FAST_FALLBACKS = [ + ("Standard_D8ds_v5", 0.38), + ("Standard_D8s_v5", 0.36), + ("Standard_D8ds_v4", 0.38), + ("Standard_D8as_v5", 0.34), +] VM_REGIONS = ["centralus", "eastus", "westus2", "eastus2"] VM_NAME = "waa-eval-vm" RESOURCE_GROUP = "openadapt-agents" @@ -62,6 +80,36 @@ "ConnectTimeout=10", ] + +def setup_vnc_tunnel_and_browser(ip: str) -> Optional[subprocess.Popen]: + """Set up SSH tunnel for VNC and open browser. + + Returns the tunnel process on success, None on failure. + """ + # Kill any existing tunnel on port 8006 + subprocess.run(["pkill", "-f", "ssh.*8006:localhost:8006"], capture_output=True) + + # Start SSH tunnel in background + tunnel_proc = subprocess.Popen( + ["ssh", *SSH_OPTS, "-N", "-L", "8006:localhost:8006", f"azureuser@{ip}"], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + + # Wait for tunnel to establish + time.sleep(2) + + # Check if tunnel is running + if tunnel_proc.poll() is not None: + return None + + # Open browser + vnc_url = "http://localhost:8006" + webbrowser.open(vnc_url) + + return tunnel_proc + + # Dockerfile location (relative to this file) DOCKERFILE_PATH = Path(__file__).parent / "waa_deploy" / "Dockerfile" @@ -287,7 +335,6 @@ def wait_for_ssh(ip: str, timeout: int = 120) -> bool: def cmd_create(args): """Create Azure VM with nested virtualization.""" init_logging() - log("CREATE", f"Creating VM '{VM_NAME}' ({VM_SIZE})...") # Check if VM already exists ip = get_vm_ip() @@ -296,49 +343,81 @@ def cmd_create(args): log("CREATE", "Use 'delete' first if you want to recreate") return 0 - # Try regions until one works + # Determine which sizes to try + use_fast = getattr(args, "fast", False) + if use_fast: + # Try multiple fast sizes with fallbacks + sizes_to_try = VM_SIZE_FAST_FALLBACKS + log( + "CREATE", + f"Creating VM '{VM_NAME}' with --fast (trying multiple D8 sizes)...", + ) + else: + # Standard mode: single size + sizes_to_try = [(VM_SIZE_STANDARD, 0.19)] + log("CREATE", f"Creating VM '{VM_NAME}' ({VM_SIZE_STANDARD}, $0.19/hr)...") + + # Try size+region combinations until one works vm_created = False - for region in VM_REGIONS: - log("CREATE", f"Trying {region}...", end=" ") + successful_size = None + successful_cost = None - result = subprocess.run( - [ - "az", - "vm", - "create", - "--resource-group", - RESOURCE_GROUP, - "--name", - VM_NAME, - "--location", - region, - "--image", - "Ubuntu2204", - "--size", - VM_SIZE, - "--admin-username", - "azureuser", - "--generate-ssh-keys", 
- "--public-ip-sku", - "Standard", - ], - capture_output=True, - text=True, - ) + for vm_size, cost_per_hour in sizes_to_try: + log("CREATE", f"Trying size {vm_size} (${cost_per_hour:.2f}/hr)...") - if result.returncode == 0: - vm_info = json.loads(result.stdout) - ip = vm_info.get("publicIpAddress", "") - log("CREATE", f"created ({ip})") - vm_created = True + for region in VM_REGIONS: + log("CREATE", f" {region}...", end=" ") + + result = subprocess.run( + [ + "az", + "vm", + "create", + "--resource-group", + RESOURCE_GROUP, + "--name", + VM_NAME, + "--location", + region, + "--image", + "Ubuntu2204", + "--size", + vm_size, + "--admin-username", + "azureuser", + "--generate-ssh-keys", + "--public-ip-sku", + "Standard", + ], + capture_output=True, + text=True, + ) + + if result.returncode == 0: + vm_info = json.loads(result.stdout) + ip = vm_info.get("publicIpAddress", "") + log("CREATE", f"created ({ip})") + vm_created = True + successful_size = vm_size + successful_cost = cost_per_hour + break + else: + log("CREATE", "unavailable") + + if vm_created: break - else: - log("CREATE", "unavailable") if not vm_created: - log("CREATE", "ERROR: Could not create VM in any region") + log("CREATE", "ERROR: Could not create VM in any region with any size") + if use_fast: + log("CREATE", "Tried sizes: " + ", ".join(s[0] for s in sizes_to_try)) return 1 + log( + "CREATE", + f"Successfully created {successful_size} (${successful_cost:.2f}/hr) in {region}", + ) + # Wait for SSH log("CREATE", "Waiting for SSH...") if not wait_for_ssh(ip): @@ -628,7 +707,19 @@ def cmd_start(args): # - Downloads Windows 11 Enterprise if not present # - Boots QEMU VM # - Runs WAA server automatically via FirstLogonCommands - log("START", "Starting container with VERSION=11e...") + # QEMU resource allocation (--fast uses more resources on D8ds_v5) + if getattr(args, "fast", False): + ram_size = "16G" + cpu_cores = 6 + log( + "START", + "Starting container with VERSION=11e (FAST mode: 6 cores, 16GB RAM)...", + ) + else: + ram_size = "8G" + cpu_cores = 4 + log("START", "Starting container with VERSION=11e...") + docker_cmd = f"""docker run -d \\ --name winarena \\ --device=/dev/kvm \\ @@ -638,8 +729,8 @@ def cmd_start(args): -p 7200:7200 \\ -v /mnt/waa-storage:/storage \\ -e VERSION=11e \\ - -e RAM_SIZE=8G \\ - -e CPU_CORES=4 \\ + -e RAM_SIZE={ram_size} \\ + -e CPU_CORES={cpu_cores} \\ -e DISK_SIZE=64G \\ {DOCKER_IMAGE}""" @@ -650,8 +741,22 @@ def cmd_start(args): log("START", "Container started") log("START", "Windows will boot and install (15-20 min on first run)") - log("START", "Monitor via: uv run python -m openadapt_ml.benchmarks.cli_v2 logs") - log("START", f"VNC (via SSH tunnel): ssh -L 8006:localhost:8006 azureuser@{ip}") + + # Auto-launch VNC unless --no-vnc specified + if not getattr(args, "no_vnc", False): + log("START", "Auto-launching VNC viewer...") + tunnel_proc = setup_vnc_tunnel_and_browser(ip) + if tunnel_proc: + log( + "START", + f"VNC auto-launched at http://localhost:8006 (tunnel PID: {tunnel_proc.pid})", + ) + else: + log("START", "WARNING: VNC tunnel failed to start") + log("START", f"Manual VNC: ssh -L 8006:localhost:8006 azureuser@{ip}") + else: + log("START", f"VNC (via SSH tunnel): ssh -L 8006:localhost:8006 azureuser@{ip}") + return 0 @@ -827,6 +932,14 @@ def cmd_run(args): f"--domain {domain}", ] + # Add parallelization flags if specified (argparse converts hyphens to underscores) + worker_id = getattr(args, "worker_id", 0) + num_workers = getattr(args, "num_workers", 1) + if num_workers > 1: + 
run_args.append(f"--worker_id {worker_id}") + run_args.append(f"--num_workers {num_workers}") + log("RUN", f"Parallel mode: worker {worker_id}/{num_workers}") + # If specific task requested, create custom test config if task: create_custom_test_cmd = f''' @@ -1255,13 +1368,34 @@ def cmd_deallocate(args): if result.returncode == 0: log("DEALLOCATE", "VM deallocated (billing stopped)") - log("DEALLOCATE", "Use 'az vm start' to resume") + log("DEALLOCATE", "Use 'vm-start' to resume") return 0 else: log("DEALLOCATE", f"ERROR: {result.stderr}") return 1 +def cmd_vm_start(args): + """Start a deallocated VM.""" + init_logging() + log("VM-START", f"Starting VM '{VM_NAME}'...") + + result = subprocess.run( + ["az", "vm", "start", "-g", RESOURCE_GROUP, "-n", VM_NAME], + capture_output=True, + text=True, + ) + + if result.returncode == 0: + ip = get_vm_ip() + log("VM-START", f"VM started: {ip}") + log("VM-START", "Run 'build' then 'start' to launch WAA container") + return 0 + else: + log("VM-START", f"ERROR: {result.stderr}") + return 1 + + def cmd_exec(args): """Run command on VM host.""" ip = get_vm_ip() @@ -1348,6 +1482,78 @@ def cmd_vnc(args): return 0 +def _show_benchmark_progress(ip: str) -> int: + """Show benchmark progress with estimated completion time. + + Parses the run log to count completed tasks and estimate remaining time. + """ + # Find the most recent run log + result = ssh_run( + ip, "ls -t /home/azureuser/cli_logs/run_*.log 2>/dev/null | head -1" + ) + log_file = result.stdout.strip() + + if not log_file: + print("No benchmark running. Start one with: run --num-tasks N") + return 1 + + # Get task count and timestamps + result = ssh_run( + ip, + f""" + echo "=== WAA Benchmark Progress ===" + echo "" + + # Count completed tasks (each "Result:" line = 1 task done) + COMPLETED=$(grep -c "Result:" {log_file} 2>/dev/null || echo 0) + # Count total tasks from task list (sum of all domain counts) + TOTAL=$(grep -A20 "Left tasks:" {log_file} | grep -E "^[a-z_]+: [0-9]+" | awk -F': ' '{{sum+=$2}} END {{print sum}}') + [ -z "$TOTAL" ] || [ "$TOTAL" -eq 0 ] && TOTAL=154 + + # Get timestamps + FIRST_TS=$(grep -oE '\\[2026-[0-9-]+ [0-9:]+' {log_file} | head -1 | tr -d '[') + LAST_TS=$(grep -oE '\\[2026-[0-9-]+ [0-9:]+' {log_file} | tail -1 | tr -d '[') + + echo "Log: {log_file}" + echo "Started: $FIRST_TS" + echo "Latest: $LAST_TS" + echo "" + echo "Tasks completed: $COMPLETED / $TOTAL" + + # Calculate elapsed minutes + if [ -n "$FIRST_TS" ] && [ -n "$LAST_TS" ]; then + START_H=$(echo "$FIRST_TS" | awk '{{print $2}}' | cut -d: -f1) + START_M=$(echo "$FIRST_TS" | awk '{{print $2}}' | cut -d: -f2) + NOW_H=$(echo "$LAST_TS" | awk '{{print $2}}' | cut -d: -f1) + NOW_M=$(echo "$LAST_TS" | awk '{{print $2}}' | cut -d: -f2) + + ELAPSED_MIN=$(( (NOW_H - START_H) * 60 + (NOW_M - START_M) )) + echo "Elapsed: $ELAPSED_MIN minutes" + + if [ "$COMPLETED" -gt 0 ] && [ "$ELAPSED_MIN" -gt 0 ]; then + MIN_PER_TASK=$((ELAPSED_MIN / COMPLETED)) + REMAINING=$((TOTAL - COMPLETED)) + EST_MIN=$((REMAINING * MIN_PER_TASK)) + EST_H=$((EST_MIN / 60)) + EST_M=$((EST_MIN % 60)) + + echo "" + echo "Avg time per task: ~$MIN_PER_TASK min" + echo "Remaining tasks: $REMAINING" + echo "Estimated remaining: ~${{EST_H}}h ${{EST_M}}m" + + # Progress bar + PCT=$((COMPLETED * 100 / TOTAL)) + echo "" + echo "Progress: $PCT% [$COMPLETED/$TOTAL]" + fi + fi + """, + ) + print(result.stdout) + return 0 + + def _show_run_logs(ip: str, follow: bool = False, tail: Optional[int] = None) -> int: """Show the most recent run command log file. 
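Editor's aside on the `--progress` helper added in the hunk above: it estimates completion time with simple average-per-task arithmetic inside a shell snippet run over SSH. A minimal Python sketch of the same calculation, for reference only — the function name, timestamps, and task counts below are hypothetical and are not part of the CLI:

```python
from datetime import datetime

def estimate_remaining(completed: int, total: int, first_ts: str, last_ts: str) -> str:
    """Rough ETA from a completed-task count and two log timestamps (illustrative sketch)."""
    fmt = "%Y-%m-%d %H:%M:%S"
    elapsed_min = int(
        (datetime.strptime(last_ts, fmt) - datetime.strptime(first_ts, fmt)).total_seconds() // 60
    )
    if completed <= 0 or elapsed_min <= 0:
        return "not enough data yet"
    min_per_task = elapsed_min // completed        # integer average, like the shell script
    remaining_min = (total - completed) * min_per_task
    pct = completed * 100 // total
    return f"{pct}% [{completed}/{total}], ~{remaining_min // 60}h {remaining_min % 60}m remaining"

# Hypothetical values: 3 of 154 tasks finished after 90 minutes of wall-clock time.
print(estimate_remaining(3, 154, "2026-01-28 22:00:00", "2026-01-28 23:30:00"))
```

The shell version approximates elapsed time from hours and minutes only; the sketch above uses full timestamp subtraction, but the averaging and remaining-time arithmetic are otherwise the same.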
@@ -1409,12 +1615,17 @@ def cmd_logs(args): Default behavior shows all relevant logs (docker, storage, probe status). Use --follow to stream docker logs continuously. Use --run to show run command output instead of container logs. + Use --progress to show benchmark progress and ETA. """ ip = get_vm_ip() if not ip: print("ERROR: VM not found") return 1 + # Handle --progress flag: show benchmark progress + if getattr(args, "progress", False): + return _show_benchmark_progress(ip) + # Handle --run flag: show run command output if args.run: return _show_run_logs(ip, args.follow, args.tail) @@ -1630,6 +1841,11 @@ def main(): # create p_create = subparsers.add_parser("create", help="Create Azure VM") + p_create.add_argument( + "--fast", + action="store_true", + help="Use larger VM (D8ds_v5, $0.38/hr) for ~30%% faster install, ~40%% faster eval", + ) p_create.set_defaults(func=cmd_create) # delete @@ -1651,6 +1867,14 @@ def main(): p_start.add_argument( "--fresh", action="store_true", help="Clean storage for fresh Windows install" ) + p_start.add_argument( + "--no-vnc", action="store_true", help="Don't auto-launch VNC viewer" + ) + p_start.add_argument( + "--fast", + action="store_true", + help="Allocate more CPU/RAM to QEMU (use with D8ds_v5 VM)", + ) p_start.set_defaults(func=cmd_start) # stop @@ -1693,6 +1917,18 @@ def main(): p_run.add_argument( "--no-download", action="store_true", help="Skip downloading results" ) + p_run.add_argument( + "--worker-id", + type=int, + default=0, + help="Worker ID for parallel execution (0-indexed)", + ) + p_run.add_argument( + "--num-workers", + type=int, + default=1, + help="Total number of parallel workers", + ) p_run.set_defaults(func=cmd_run) # download @@ -1720,6 +1956,10 @@ def main(): p_dealloc = subparsers.add_parser("deallocate", help="Stop VM (preserves disk)") p_dealloc.set_defaults(func=cmd_deallocate) + # vm-start + p_vmstart = subparsers.add_parser("vm-start", help="Start a deallocated VM") + p_vmstart.set_defaults(func=cmd_vm_start) + # logs p_logs = subparsers.add_parser("logs", help="Show WAA status and logs") p_logs.add_argument( @@ -1733,6 +1973,12 @@ def main(): action="store_true", help="Show run command output instead of container logs", ) + p_logs.add_argument( + "--progress", + "-p", + action="store_true", + help="Show benchmark progress and estimated completion time", + ) p_logs.set_defaults(func=cmd_logs) # exec diff --git a/openadapt_ml/benchmarks/data_collection.py b/openadapt_ml/benchmarks/data_collection.py deleted file mode 100644 index 8147a94..0000000 --- a/openadapt_ml/benchmarks/data_collection.py +++ /dev/null @@ -1,444 +0,0 @@ -"""Data collection for benchmark viewer integration. - -This module handles saving execution traces during benchmark runs for later -replay in the benchmark viewer. It creates a structured directory layout with -screenshots, metadata, and execution traces. - -Directory structure: - benchmark_results/ - ├── waa_eval_YYYYMMDD_HHMMSS/ - │ ├── metadata.json - │ ├── tasks/ - │ │ ├── task_001/ - │ │ │ ├── task.json - │ │ │ ├── screenshots/ - │ │ │ │ ├── step_000.png - │ │ │ │ ├── step_001.png - │ │ │ │ └── ... - │ │ │ └── execution.json - │ │ └── task_002/ - │ │ └── ... 
- │ └── summary.json - -Example: - from openadapt_ml.benchmarks.data_collection import ExecutionTraceCollector - - collector = ExecutionTraceCollector( - benchmark_name="waa", - run_name="waa_eval_20241214", - model_id="qwen3vl-2b-epoch5" - ) - - # During evaluation - collector.start_task(task) - for step_idx, (obs, action) in enumerate(trajectory): - collector.record_step(step_idx, obs, action, reasoning="...") - collector.finish_task(result) -""" - -from __future__ import annotations - -import json -import logging -from dataclasses import asdict, dataclass -from datetime import datetime -from pathlib import Path -from typing import Any - -from openadapt_ml.benchmarks.base import ( - BenchmarkAction, - BenchmarkObservation, - BenchmarkResult, - BenchmarkTask, -) - -logger = logging.getLogger(__name__) - - -@dataclass -class ExecutionStep: - """Single step in execution trace. - - Attributes: - step_idx: Step index in the trajectory. - screenshot_path: Relative path to screenshot image. - action: Action taken at this step. - reasoning: Optional reasoning/thought from the agent. - timestamp: Timestamp when step was recorded. - """ - - step_idx: int - screenshot_path: str | None - action: dict[str, Any] # Serialized BenchmarkAction - reasoning: str | None = None - timestamp: float | None = None - - -class ExecutionTraceCollector: - """Collects execution traces during benchmark runs. - - This class handles: - - Creating the directory structure for a benchmark run - - Saving screenshots at each step - - Recording actions and reasoning - - Saving task results and metadata - - Args: - benchmark_name: Name of the benchmark (e.g., "waa", "webarena"). - run_name: Unique name for this evaluation run (e.g., "waa_eval_20241214"). - model_id: Identifier for the model being evaluated. - output_dir: Base directory for benchmark results (default: "./benchmark_results"). - """ - - def __init__( - self, - benchmark_name: str, - run_name: str | None = None, - model_id: str = "unknown", - output_dir: str | Path = "benchmark_results", - ): - self.benchmark_name = benchmark_name - self.model_id = model_id - - # Auto-generate run_name if not provided - if run_name is None: - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - run_name = f"{benchmark_name}_eval_{timestamp}" - self.run_name = run_name - - # Set up directory structure - self.output_dir = Path(output_dir) - self.run_dir = self.output_dir / run_name - self.tasks_dir = self.run_dir / "tasks" - - # Current task tracking - self._current_task: BenchmarkTask | None = None - self._current_task_dir: Path | None = None - self._current_screenshots_dir: Path | None = None - self._current_steps: list[ExecutionStep] = [] - - # Initialize run - self._initialize_run() - - def _initialize_run(self) -> None: - """Initialize the benchmark run directory and metadata.""" - self.run_dir.mkdir(parents=True, exist_ok=True) - self.tasks_dir.mkdir(exist_ok=True) - - # Save run metadata - metadata = { - "benchmark_name": self.benchmark_name, - "run_name": self.run_name, - "model_id": self.model_id, - "created_at": datetime.now().isoformat(), - } - - metadata_path = self.run_dir / "metadata.json" - with open(metadata_path, "w") as f: - json.dump(metadata, f, indent=2) - - logger.info(f"Initialized benchmark run at: {self.run_dir}") - - def start_task(self, task: BenchmarkTask) -> None: - """Start collecting data for a new task. - - Args: - task: The benchmark task being executed. 
- """ - if self._current_task is not None: - logger.warning( - f"Starting new task {task.task_id} without finishing {self._current_task.task_id}" - ) - - self._current_task = task - self._current_steps = [] - - # Create task directory - task_dir_name = self._sanitize_task_id(task.task_id) - self._current_task_dir = self.tasks_dir / task_dir_name - self._current_task_dir.mkdir(parents=True, exist_ok=True) - - # Create screenshots directory - self._current_screenshots_dir = self._current_task_dir / "screenshots" - self._current_screenshots_dir.mkdir(exist_ok=True) - - # Save task definition - task_data = { - "task_id": task.task_id, - "instruction": task.instruction, - "domain": task.domain, - "initial_state_ref": task.initial_state_ref, - "time_limit_steps": task.time_limit_steps, - "raw_config": task.raw_config, - "evaluation_spec": task.evaluation_spec, - } - - task_path = self._current_task_dir / "task.json" - with open(task_path, "w") as f: - json.dump(task_data, f, indent=2) - - logger.info(f"Started collecting data for task: {task.task_id}") - - def record_step( - self, - step_idx: int, - observation: BenchmarkObservation, - action: BenchmarkAction, - reasoning: str | None = None, - ) -> None: - """Record a single step in the execution trace. - - Args: - step_idx: Index of this step in the trajectory. - observation: Observation at this step. - action: Action taken at this step. - reasoning: Optional reasoning/thought from the agent. - """ - if self._current_task is None: - raise RuntimeError("No task started. Call start_task() first.") - - # Save screenshot if available - screenshot_path = None - if observation.screenshot is not None: - screenshot_path = self._save_screenshot(step_idx, observation.screenshot) - elif observation.screenshot_path is not None: - # Copy existing screenshot - screenshot_path = self._copy_screenshot( - step_idx, observation.screenshot_path - ) - - # Create execution step record - step = ExecutionStep( - step_idx=step_idx, - screenshot_path=screenshot_path, - action=self._serialize_action(action), - reasoning=reasoning, - timestamp=datetime.now().timestamp(), - ) - - self._current_steps.append(step) - - def finish_task(self, result: BenchmarkResult) -> None: - """Finish collecting data for the current task and save execution trace. - - Args: - result: The evaluation result for the task. - """ - if self._current_task is None: - raise RuntimeError("No task started. Call start_task() first.") - - # Save execution trace - execution_data = { - "task_id": result.task_id, - "model_id": self.model_id, - "success": result.success, - "score": result.score, - "num_steps": result.num_steps, - "total_time_seconds": result.total_time_seconds, - "error": result.error, - "reason": result.reason, - "steps": [asdict(step) for step in self._current_steps], - } - - execution_path = self._current_task_dir / "execution.json" - with open(execution_path, "w") as f: - json.dump(execution_data, f, indent=2) - - logger.info( - f"Saved execution trace for task {result.task_id}: " - f"{'SUCCESS' if result.success else 'FAIL'} ({result.num_steps} steps)" - ) - - # Clear current task - self._current_task = None - self._current_task_dir = None - self._current_screenshots_dir = None - self._current_steps = [] - - def save_summary(self, all_results: list[BenchmarkResult]) -> None: - """Save summary of all task results. - - Args: - all_results: List of all BenchmarkResult objects from the run. 
- """ - summary = { - "benchmark_name": self.benchmark_name, - "run_name": self.run_name, - "model_id": self.model_id, - "num_tasks": len(all_results), - "num_success": sum(1 for r in all_results if r.success), - "success_rate": sum(1 for r in all_results if r.success) / len(all_results) - if all_results - else 0.0, - "avg_score": sum(r.score for r in all_results) / len(all_results) - if all_results - else 0.0, - "avg_steps": sum(r.num_steps for r in all_results) / len(all_results) - if all_results - else 0.0, - "avg_time_seconds": sum(r.total_time_seconds for r in all_results) - / len(all_results) - if all_results - else 0.0, - "tasks": [ - { - "task_id": r.task_id, - "success": r.success, - "score": r.score, - "num_steps": r.num_steps, - "error": r.error, - } - for r in all_results - ], - } - - summary_path = self.run_dir / "summary.json" - with open(summary_path, "w") as f: - json.dump(summary, f, indent=2) - - logger.info( - f"Saved summary: {summary['num_success']}/{summary['num_tasks']} tasks succeeded " - f"({summary['success_rate']:.1%})" - ) - - def _save_screenshot(self, step_idx: int, screenshot_bytes: bytes) -> str: - """Save screenshot bytes to file. - - Args: - step_idx: Step index for naming the file. - screenshot_bytes: PNG image bytes. - - Returns: - Relative path to the saved screenshot. - """ - if self._current_screenshots_dir is None: - raise RuntimeError("No task started") - - filename = f"step_{step_idx:03d}.png" - screenshot_path = self._current_screenshots_dir / filename - - with open(screenshot_path, "wb") as f: - f.write(screenshot_bytes) - - # Return relative path from task directory - return f"screenshots/{filename}" - - def _copy_screenshot(self, step_idx: int, source_path: str) -> str: - """Copy screenshot from existing path. - - Args: - step_idx: Step index for naming the file. - source_path: Path to existing screenshot. - - Returns: - Relative path to the copied screenshot. - """ - if self._current_screenshots_dir is None: - raise RuntimeError("No task started") - - filename = f"step_{step_idx:03d}.png" - dest_path = self._current_screenshots_dir / filename - - # Copy file - import shutil - - shutil.copy2(source_path, dest_path) - - return f"screenshots/{filename}" - - def _serialize_action(self, action: BenchmarkAction) -> dict[str, Any]: - """Serialize BenchmarkAction to dict. - - Args: - action: Action to serialize. - - Returns: - Dictionary representation of the action. - """ - return { - "type": action.type, - "x": action.x, - "y": action.y, - "target_node_id": action.target_node_id, - "target_bbox": action.target_bbox, - "target_role": action.target_role, - "target_name": action.target_name, - "text": action.text, - "key": action.key, - "modifiers": action.modifiers, - "scroll_direction": action.scroll_direction, - "scroll_amount": action.scroll_amount, - "end_x": action.end_x, - "end_y": action.end_y, - "answer": action.answer, - "raw_action": action.raw_action, - } - - def _sanitize_task_id(self, task_id: str) -> str: - """Sanitize task ID for use as directory name. - - Args: - task_id: Original task ID. - - Returns: - Sanitized task ID safe for filesystem. 
- """ - # Replace unsafe characters with underscores - safe_id = "".join(c if c.isalnum() or c in "-_" else "_" for c in task_id) - return safe_id - - -def save_execution_trace( - task: BenchmarkTask, - result: BenchmarkResult, - trajectory: list[tuple[BenchmarkObservation, BenchmarkAction]], - benchmark_name: str, - model_id: str = "unknown", - output_dir: str | Path = "benchmark_results", - run_name: str | None = None, - reasoning_map: dict[int, str] | None = None, -) -> Path: - """Convenience function to save a complete execution trace. - - This is a simpler alternative to using ExecutionTraceCollector directly - when you have the complete trajectory available. - - Args: - task: The benchmark task. - result: The evaluation result. - trajectory: List of (observation, action) pairs. - benchmark_name: Name of the benchmark. - model_id: Identifier for the model. - output_dir: Base directory for results. - run_name: Optional run name (auto-generated if None). - reasoning_map: Optional map of step_idx -> reasoning text. - - Returns: - Path to the task directory. - - Example: - save_execution_trace( - task=task, - result=result, - trajectory=trajectory, - benchmark_name="waa", - model_id="qwen3vl-2b-epoch5", - reasoning_map={0: "I should click the button", 1: "Now type the text"} - ) - """ - collector = ExecutionTraceCollector( - benchmark_name=benchmark_name, - run_name=run_name, - model_id=model_id, - output_dir=output_dir, - ) - - collector.start_task(task) - - for step_idx, (obs, action) in enumerate(trajectory): - reasoning = reasoning_map.get(step_idx) if reasoning_map else None - collector.record_step(step_idx, obs, action, reasoning) - - collector.finish_task(result) - - return collector._current_task_dir or collector.tasks_dir diff --git a/openadapt_ml/benchmarks/live_tracker.py b/openadapt_ml/benchmarks/live_tracker.py deleted file mode 100644 index 4126a8d..0000000 --- a/openadapt_ml/benchmarks/live_tracker.py +++ /dev/null @@ -1,188 +0,0 @@ -"""Live evaluation progress tracker for benchmark viewer. - -This module provides a tracker that writes real-time evaluation progress -to a JSON file that the viewer can poll via /api/benchmark-live. -""" - -from __future__ import annotations - -import json -from dataclasses import asdict, dataclass -from pathlib import Path -from typing import Any - -from openadapt_ml.benchmarks.base import ( - BenchmarkAction, - BenchmarkObservation, - BenchmarkResult, - BenchmarkTask, -) - - -@dataclass -class LiveStepData: - """Data for a single step in live evaluation.""" - - step_idx: int - action: dict[str, Any] - reasoning: str | None = None - screenshot_url: str | None = None - - -@dataclass -class LiveTaskData: - """Data for current task being evaluated.""" - - task_id: str - instruction: str - domain: str - steps: list[LiveStepData] - result: dict[str, Any] | None = None - - -class LiveEvaluationTracker: - """Tracks live evaluation progress and writes to benchmark_live.json. - - This class is designed to be used alongside ExecutionTraceCollector - to provide real-time progress updates to the viewer. - - Args: - output_file: Path to output JSON file (default: benchmark_live.json). - total_tasks: Total number of tasks to evaluate. 
- """ - - def __init__( - self, - output_file: str | Path = "benchmark_live.json", - total_tasks: int = 0, - ): - self.output_file = Path(output_file) - self.total_tasks = total_tasks - self.tasks_completed = 0 - self.current_task: LiveTaskData | None = None - - # Initialize with idle state - self._write_state({"status": "idle"}) - - def start_task(self, task: BenchmarkTask) -> None: - """Start tracking a new task. - - Args: - task: The benchmark task being evaluated. - """ - self.current_task = LiveTaskData( - task_id=task.task_id, - instruction=task.instruction, - domain=task.domain or "unknown", - steps=[], - result=None, - ) - - self._write_state( - { - "status": "running", - "total_tasks": self.total_tasks, - "tasks_completed": self.tasks_completed, - "current_task": asdict(self.current_task), - } - ) - - def record_step( - self, - step_idx: int, - observation: BenchmarkObservation, - action: BenchmarkAction, - reasoning: str | None = None, - ) -> None: - """Record a step in the current task. - - Args: - step_idx: Index of this step. - observation: Observation at this step. - action: Action taken at this step. - reasoning: Optional reasoning/thought from agent. - """ - if self.current_task is None: - raise RuntimeError("No task started. Call start_task() first.") - - # Serialize action - action_data = { - "type": action.type, - "x": action.x, - "y": action.y, - "target_node_id": action.target_node_id, - "text": action.text, - "key": action.key, - } - - # Create step data - step = LiveStepData( - step_idx=step_idx, - action=action_data, - reasoning=reasoning, - screenshot_url=None, # Could be populated if we serve screenshots - ) - - self.current_task.steps.append(step) - - # Write updated state - self._write_state( - { - "status": "running", - "total_tasks": self.total_tasks, - "tasks_completed": self.tasks_completed, - "current_task": asdict(self.current_task), - } - ) - - def finish_task(self, result: BenchmarkResult) -> None: - """Finish tracking the current task. - - Args: - result: The evaluation result for the task. - """ - if self.current_task is None: - raise RuntimeError("No task started. Call start_task() first.") - - # Add result to current task - self.current_task.result = { - "success": result.success, - "score": result.score, - "num_steps": result.num_steps, - "total_time_seconds": result.total_time_seconds, - } - - # Increment completed count - self.tasks_completed += 1 - - # Write updated state - self._write_state( - { - "status": "running", - "total_tasks": self.total_tasks, - "tasks_completed": self.tasks_completed, - "current_task": asdict(self.current_task), - } - ) - - # Clear current task - self.current_task = None - - def finish(self) -> None: - """Mark evaluation as complete.""" - self._write_state( - { - "status": "complete", - "total_tasks": self.total_tasks, - "tasks_completed": self.tasks_completed, - } - ) - - def _write_state(self, state: dict[str, Any]) -> None: - """Write current state to JSON file. - - Args: - state: State dictionary to write. - """ - with open(self.output_file, "w") as f: - json.dump(state, f, indent=2) diff --git a/openadapt_ml/benchmarks/runner.py b/openadapt_ml/benchmarks/runner.py deleted file mode 100644 index 320af27..0000000 --- a/openadapt_ml/benchmarks/runner.py +++ /dev/null @@ -1,432 +0,0 @@ -"""Evaluation runner for benchmarks. - -This module provides functions to run agents on benchmarks and collect results. 
- -Example: - from openadapt_ml.benchmarks import WAAAdapter, PolicyAgent, evaluate_agent_on_benchmark - - adapter = WAAAdapter(waa_repo_path="/path/to/WAA") - agent = PolicyAgent(policy) - results = evaluate_agent_on_benchmark(agent, adapter, max_steps=50) - - print(f"Success rate: {sum(r.success for r in results) / len(results):.1%}") -""" - -from __future__ import annotations - -import logging -import time -from concurrent.futures import ThreadPoolExecutor, as_completed -from dataclasses import dataclass -from typing import Callable - -from openadapt_ml.benchmarks.agent import BenchmarkAgent -from openadapt_ml.benchmarks.base import ( - BenchmarkAdapter, - BenchmarkAction, - BenchmarkObservation, - BenchmarkResult, - BenchmarkTask, -) -from openadapt_ml.benchmarks.data_collection import ExecutionTraceCollector -from openadapt_ml.benchmarks.live_tracker import LiveEvaluationTracker - -logger = logging.getLogger(__name__) - - -@dataclass -class EvaluationConfig: - """Configuration for benchmark evaluation. - - Attributes: - max_steps: Maximum steps per task. - parallel: Number of parallel workers (if supported). - save_trajectories: Whether to save full trajectories in results. - verbose: Whether to print progress. - on_step: Optional callback called after each step. - on_task_complete: Optional callback called after each task. - save_execution_traces: Whether to save execution traces for viewer. - model_id: Model identifier for execution traces. - output_dir: Output directory for benchmark results. - run_name: Name for this evaluation run. - enable_live_tracking: Whether to enable live evaluation progress tracking. - live_tracking_file: Path to live tracking JSON file. - """ - - max_steps: int = 50 - parallel: int = 1 - save_trajectories: bool = True - verbose: bool = True - on_step: Callable[[BenchmarkObservation, BenchmarkAction, int], None] | None = None - on_task_complete: Callable[[BenchmarkResult], None] | None = None - save_execution_traces: bool = True - model_id: str = "unknown" - output_dir: str = "benchmark_results" - run_name: str | None = None - enable_live_tracking: bool = True - live_tracking_file: str = "benchmark_live.json" - - -def evaluate_agent_on_benchmark( - agent: BenchmarkAgent, - adapter: BenchmarkAdapter, - task_ids: list[str] | None = None, - max_steps: int = 50, - parallel: int = 1, - config: EvaluationConfig | None = None, -) -> list[BenchmarkResult]: - """Run agent on benchmark tasks and collect results. - - Args: - agent: Agent to evaluate. - adapter: Benchmark adapter. - task_ids: Specific tasks to run (None = all tasks). - max_steps: Maximum steps per task (overridden by config if provided). - parallel: Number of parallel workers (overridden by config if provided). - config: Full evaluation configuration. - - Returns: - List of BenchmarkResult for each task. 
- """ - if config is None: - config = EvaluationConfig(max_steps=max_steps, parallel=parallel) - - # Load tasks - if task_ids is not None: - tasks = [adapter.load_task(tid) for tid in task_ids] - else: - tasks = adapter.list_tasks() - - if config.verbose: - logger.info(f"Evaluating {len(tasks)} tasks on {adapter.name}") - - # Initialize execution trace collector if enabled - trace_collector = None - if config.save_execution_traces: - trace_collector = ExecutionTraceCollector( - benchmark_name=adapter.name, - run_name=config.run_name, - model_id=config.model_id, - output_dir=config.output_dir, - ) - if config.verbose: - logger.info(f"Saving execution traces to: {trace_collector.run_dir}") - - # Initialize live evaluation tracker if enabled - live_tracker = None - if config.enable_live_tracking: - live_tracker = LiveEvaluationTracker( - output_file=config.live_tracking_file, - total_tasks=len(tasks), - ) - if config.verbose: - logger.info(f"Live tracking enabled: {config.live_tracking_file}") - - # Run evaluation - if config.parallel > 1 and adapter.supports_parallel: - results = _evaluate_parallel( - agent, adapter, tasks, config, trace_collector, live_tracker - ) - else: - results = _evaluate_sequential( - agent, adapter, tasks, config, trace_collector, live_tracker - ) - - # Save summary if trace collection is enabled - if trace_collector is not None: - trace_collector.save_summary(results) - - # Mark live tracking as complete - if live_tracker is not None: - live_tracker.finish() - - # Log summary - if config.verbose: - success_count = sum(1 for r in results if r.success) - success_rate = success_count / len(results) if results else 0 - avg_steps = sum(r.num_steps for r in results) / len(results) if results else 0 - logger.info( - f"Evaluation complete: {success_count}/{len(results)} " - f"({success_rate:.1%}) success, {avg_steps:.1f} avg steps" - ) - - return results - - -def _evaluate_sequential( - agent: BenchmarkAgent, - adapter: BenchmarkAdapter, - tasks: list[BenchmarkTask], - config: EvaluationConfig, - trace_collector: ExecutionTraceCollector | None = None, - live_tracker: LiveEvaluationTracker | None = None, -) -> list[BenchmarkResult]: - """Run evaluation sequentially. - - Args: - agent: Agent to evaluate. - adapter: Benchmark adapter. - tasks: Tasks to evaluate. - config: Evaluation configuration. - trace_collector: Optional trace collector for saving execution data. - live_tracker: Optional live evaluation tracker. - - Returns: - List of results. - """ - results = [] - for i, task in enumerate(tasks): - if config.verbose: - logger.info(f"Task {i + 1}/{len(tasks)}: {task.task_id}") - - result = _run_single_task( - agent, adapter, task, config, trace_collector, live_tracker - ) - results.append(result) - - if config.on_task_complete: - config.on_task_complete(result) - - return results - - -def _evaluate_parallel( - agent: BenchmarkAgent, - adapter: BenchmarkAdapter, - tasks: list[BenchmarkTask], - config: EvaluationConfig, - trace_collector: ExecutionTraceCollector | None = None, - live_tracker: LiveEvaluationTracker | None = None, -) -> list[BenchmarkResult]: - """Run evaluation in parallel. - - Note: This requires the adapter to support parallel execution - (e.g., via multiple VM instances). - - Args: - agent: Agent to evaluate. - adapter: Benchmark adapter. - tasks: Tasks to evaluate. - config: Evaluation configuration. - trace_collector: Optional trace collector for saving execution data. - live_tracker: Optional live evaluation tracker. - - Returns: - List of results. 
- """ - results = [] - - with ThreadPoolExecutor(max_workers=config.parallel) as executor: - # Submit all tasks - future_to_task = { - executor.submit( - _run_single_task, - agent, - adapter, - task, - config, - trace_collector, - live_tracker, - ): task - for task in tasks - } - - # Collect results as they complete - for future in as_completed(future_to_task): - task = future_to_task[future] - try: - result = future.result() - results.append(result) - - if config.on_task_complete: - config.on_task_complete(result) - - if config.verbose: - status = "SUCCESS" if result.success else "FAIL" - logger.info(f"Task {task.task_id}: {status}") - - except Exception as e: - logger.error(f"Task {task.task_id} failed with error: {e}") - results.append( - BenchmarkResult( - task_id=task.task_id, - success=False, - score=0.0, - error=str(e), - ) - ) - - return results - - -def _run_single_task( - agent: BenchmarkAgent, - adapter: BenchmarkAdapter, - task: BenchmarkTask, - config: EvaluationConfig, - trace_collector: ExecutionTraceCollector | None = None, - live_tracker: LiveEvaluationTracker | None = None, -) -> BenchmarkResult: - """Run a single task and return result. - - Args: - agent: Agent to evaluate. - adapter: Benchmark adapter. - task: Task to run. - config: Evaluation configuration. - trace_collector: Optional trace collector for saving execution data. - live_tracker: Optional live evaluation tracker. - - Returns: - BenchmarkResult. - """ - start_time = time.perf_counter() - history: list[tuple[BenchmarkObservation, BenchmarkAction]] = [] - - # Start trace collection if enabled - if trace_collector is not None: - trace_collector.start_task(task) - - # Start live tracking if enabled - if live_tracker is not None: - live_tracker.start_task(task) - - try: - # Reset agent and environment - agent.reset() - obs = adapter.reset(task) - - done = False - steps = 0 - max_steps = task.time_limit_steps or config.max_steps - - while not done and steps < max_steps: - # Get action from agent - action = agent.act(obs, task, history if config.save_trajectories else None) - - # Extract reasoning if available from PolicyAgent - reasoning = None - if hasattr(action, "raw_action") and action.raw_action: - reasoning = action.raw_action.get("thought") - - # Record step in trace collector - if trace_collector is not None: - trace_collector.record_step(steps, obs, action, reasoning) - - # Record step in live tracker - if live_tracker is not None: - live_tracker.record_step(steps, obs, action, reasoning) - - # Record step in history - if config.save_trajectories: - history.append((obs, action)) - - if config.on_step: - config.on_step(obs, action, steps) - - # Check for terminal action - if action.type == "done": - done = True - break - - # Execute action - obs, done, info = adapter.step(action) - steps += 1 - - # Evaluate result - result = adapter.evaluate(task) - - # Update result with trajectory info - result.steps = history if config.save_trajectories else [] - result.num_steps = steps - result.total_time_seconds = time.perf_counter() - start_time - - # Finish trace collection if enabled - if trace_collector is not None: - trace_collector.finish_task(result) - - # Finish live tracking if enabled - if live_tracker is not None: - live_tracker.finish_task(result) - - return result - - except Exception as e: - logger.error(f"Error running task {task.task_id}: {e}") - result = BenchmarkResult( - task_id=task.task_id, - success=False, - score=0.0, - steps=history if config.save_trajectories else [], - 
num_steps=len(history), - error=str(e), - total_time_seconds=time.perf_counter() - start_time, - ) - - # Finish trace collection even on error - if trace_collector is not None: - trace_collector.finish_task(result) - - return result - - -def compute_metrics(results: list[BenchmarkResult]) -> dict: - """Compute aggregate metrics from evaluation results. - - Args: - results: List of BenchmarkResult from evaluation. - - Returns: - Dict with aggregate metrics. - """ - if not results: - return { - "num_tasks": 0, - "success_rate": 0.0, - "avg_score": 0.0, - "avg_steps": 0.0, - "avg_time_seconds": 0.0, - } - - num_tasks = len(results) - success_count = sum(1 for r in results if r.success) - total_score = sum(r.score for r in results) - total_steps = sum(r.num_steps for r in results) - total_time = sum(r.total_time_seconds for r in results) - - return { - "num_tasks": num_tasks, - "success_rate": success_count / num_tasks, - "avg_score": total_score / num_tasks, - "avg_steps": total_steps / num_tasks, - "avg_time_seconds": total_time / num_tasks, - "success_count": success_count, - "fail_count": num_tasks - success_count, - } - - -def compute_domain_metrics( - results: list[BenchmarkResult], tasks: list[BenchmarkTask] -) -> dict[str, dict]: - """Compute per-domain metrics. - - Args: - results: List of BenchmarkResult. - tasks: List of BenchmarkTask (to get domain info). - - Returns: - Dict mapping domain to metrics dict. - """ - # Build task_id -> domain mapping - task_domains = {t.task_id: t.domain for t in tasks} - - # Group results by domain - domain_results: dict[str, list[BenchmarkResult]] = {} - for result in results: - domain = task_domains.get(result.task_id, "unknown") - if domain not in domain_results: - domain_results[domain] = [] - domain_results[domain].append(result) - - # Compute metrics per domain - return {domain: compute_metrics(res) for domain, res in domain_results.items()} diff --git a/openadapt_ml/benchmarks/waa.py b/openadapt_ml/benchmarks/waa.py deleted file mode 100644 index 120a954..0000000 --- a/openadapt_ml/benchmarks/waa.py +++ /dev/null @@ -1,831 +0,0 @@ -"""Windows Agent Arena (WAA) benchmark adapter. - -This module provides integration with the Windows Agent Arena benchmark, -enabling evaluation of GUI agents on 154 Windows tasks across 11 domains. - -WAA Repository: https://github.com/microsoft/WindowsAgentArena - -Example: - from openadapt_ml.benchmarks import WAAAdapter, PolicyAgent, evaluate_agent_on_benchmark - - adapter = WAAAdapter(waa_repo_path="/path/to/WindowsAgentArena") - agent = PolicyAgent(policy) - results = evaluate_agent_on_benchmark(agent, adapter, max_steps=15) - print(f"Success rate: {sum(r.success for r in results) / len(results):.1%}") -""" - -from __future__ import annotations - -import json -import logging -import sys -import time -from dataclasses import dataclass -from pathlib import Path -from typing import Any - -from openadapt_ml.benchmarks.base import ( - BenchmarkAction, - BenchmarkAdapter, - BenchmarkObservation, - BenchmarkResult, - BenchmarkTask, -) - -logger = logging.getLogger(__name__) - - -# WAA domain mapping (11 domains, 154 tasks) -WAA_DOMAINS = [ - "browser", - "office", - "coding", - "media", - "notepad", - "paint", - "file_explorer", - "clock", - "settings", - "edge", - "vscode", -] - - -@dataclass -class WAAConfig: - """Configuration for WAA adapter. - - Attributes: - waa_repo_path: Path to cloned WindowsAgentArena repository. - use_azure: Whether to use Azure VMs (enables parallelism). 
- observation_type: Type of observation to capture. - a11y_backend: Accessibility backend ("uia" or "win32"). - screen_width: Screen width in pixels. - screen_height: Screen height in pixels. - max_steps: Default maximum steps per task. - action_delay: Delay between actions in seconds. - """ - - waa_repo_path: str - use_azure: bool = False - observation_type: str = "screenshot_a11y_tree" # "screenshot", "a11y_tree", "som" - a11y_backend: str = "uia" # "uia" or "win32" - screen_width: int = 1920 - screen_height: int = 1200 - max_steps: int = 15 - action_delay: float = 0.5 - - -class WAAAdapter(BenchmarkAdapter): - """Windows Agent Arena benchmark adapter. - - Integrates with the WAA benchmark to evaluate GUI agents on 154 Windows - desktop automation tasks spanning 11 application domains. - - The adapter wraps WAA's DesktopEnv and provides: - - Task loading from WAA's JSON task definitions - - VM/environment reset to task initial state - - Action execution via WAA's controller - - Evaluation using WAA's native evaluators - - Args: - waa_repo_path: Path to cloned WindowsAgentArena repository. - use_azure: Use Azure VMs for execution (enables parallelism). - config: Full WAAConfig (overrides other args if provided). - **kwargs: Additional config options passed to WAAConfig. - - Raises: - ValueError: If waa_repo_path doesn't exist. - ImportError: If WAA dependencies not available. - """ - - def __init__( - self, - waa_repo_path: str | Path | None = None, - use_azure: bool = False, - config: WAAConfig | None = None, - **kwargs, - ): - if config is not None: - self.config = config - else: - if waa_repo_path is None: - raise ValueError("waa_repo_path is required") - self.config = WAAConfig( - waa_repo_path=str(waa_repo_path), - use_azure=use_azure, - **kwargs, - ) - - self.waa_repo = Path(self.config.waa_repo_path) - if not self.waa_repo.exists(): - raise ValueError(f"WAA repository not found at: {self.waa_repo}") - - # Paths to WAA components - self._client_path = self.waa_repo / "src" / "win-arena-container" / "client" - self._tasks_path = self._client_path / "evaluation_examples_windows" - - # Lazy-loaded WAA components - self._desktop_env = None - self._task_cache: dict[str, BenchmarkTask] = {} - self._current_task: BenchmarkTask | None = None - self._waa_imported = False - - def _ensure_waa_imported(self) -> None: - """Import WAA modules (lazy loading).""" - if self._waa_imported: - return - - # Add WAA client to path - client_path = str(self._client_path) - if client_path not in sys.path: - sys.path.insert(0, client_path) - - try: - # Import WAA's DesktopEnv - from desktop_env import DesktopEnv - - self._DesktopEnv = DesktopEnv - self._waa_imported = True - logger.info("WAA modules imported successfully") - except ImportError as e: - raise ImportError( - f"Failed to import WAA modules. Ensure WAA is properly installed " - f"and dependencies are available: {e}" - ) from e - - @property - def name(self) -> str: - """Benchmark name.""" - return "waa" - - @property - def benchmark_type(self) -> str: - """Benchmark type (interactive).""" - return "interactive" - - @property - def supports_parallel(self) -> bool: - """Whether parallel execution is supported (requires Azure).""" - return self.config.use_azure - - def list_tasks(self, domain: str | None = None) -> list[BenchmarkTask]: - """List available WAA tasks. 
- - WAA has 154 tasks across 11 domains: - - browser: Edge/Chrome navigation and settings - - office: Word, Excel, Outlook - - coding: VSCode, terminal - - settings: Windows Settings app - - file_explorer: File operations - - notepad: Text editing - - paint: Drawing operations - - media: Video/audio playback - - clock: Alarms, timers - - edge: Browser-specific - - vscode: IDE-specific - - Args: - domain: Optional domain filter. - - Returns: - List of BenchmarkTask objects. - """ - tasks = self._load_all_tasks() - - if domain is not None: - tasks = [t for t in tasks if t.domain == domain] - - return tasks - - def load_task(self, task_id: str) -> BenchmarkTask: - """Load a specific task by ID. - - Args: - task_id: Task identifier (e.g., "notepad_1", "browser_5"). - - Returns: - BenchmarkTask object. - - Raises: - KeyError: If task_id not found. - """ - if task_id in self._task_cache: - return self._task_cache[task_id] - - # Try to load from disk - tasks = self._load_all_tasks() - task_map = {t.task_id: t for t in tasks} - - if task_id not in task_map: - raise KeyError( - f"Task '{task_id}' not found. Available: {list(task_map.keys())[:10]}..." - ) - - return task_map[task_id] - - def reset(self, task: BenchmarkTask) -> BenchmarkObservation: - """Reset environment to task's initial state. - - This initializes the Windows VM/desktop to the state required for - the task, including opening required applications and setting up - any pre-conditions. - - Args: - task: Task to initialize. - - Returns: - Initial observation (screenshot + accessibility tree). - """ - self._ensure_waa_imported() - self._current_task = task - - # Initialize DesktopEnv if needed - if self._desktop_env is None: - self._desktop_env = self._create_desktop_env() - - # Load task config and reset environment - task_config = self._load_waa_task_config(task) - obs = self._desktop_env.reset(task_config=task_config) - - return self._to_benchmark_observation(obs) - - def step( - self, action: BenchmarkAction - ) -> tuple[BenchmarkObservation, bool, dict[str, Any]]: - """Execute action and return new observation. - - Args: - action: Action to execute. - - Returns: - Tuple of (observation, done, info). - """ - if self._desktop_env is None: - raise RuntimeError("Call reset() before step()") - - # Convert to WAA action format - waa_action = self._to_waa_action(action) - - # Execute action - obs, reward, done, info = self._desktop_env.step(waa_action) - - # Optional delay between actions - if self.config.action_delay > 0: - time.sleep(self.config.action_delay) - - return self._to_benchmark_observation(obs), done, info - - def evaluate(self, task: BenchmarkTask) -> BenchmarkResult: - """Run WAA's native evaluation on current state. - - WAA evaluators check the actual OS state (files, settings, app state) - to determine if the task was completed successfully. - - Args: - task: Task to evaluate. - - Returns: - BenchmarkResult with success/score. 
- """ - if self._desktop_env is None: - raise RuntimeError("Call reset() and step() before evaluate()") - - # Run WAA's evaluator - try: - result = self._desktop_env.evaluate() - success = result.get("success", False) - score = 1.0 if success else 0.0 - reason = result.get("reason", None) - except Exception as e: - logger.error(f"Evaluation failed for task {task.task_id}: {e}") - success = False - score = 0.0 - reason = str(e) - - return BenchmarkResult( - task_id=task.task_id, - success=success, - score=score, - reason=reason, - ) - - def close(self) -> None: - """Clean up resources.""" - if self._desktop_env is not None: - try: - self._desktop_env.close() - except Exception as e: - logger.warning(f"Error closing DesktopEnv: {e}") - self._desktop_env = None - - def _create_desktop_env(self): - """Create WAA DesktopEnv instance.""" - require_a11y = self.config.observation_type in [ - "a11y_tree", - "screenshot_a11y_tree", - "som", - ] - - return self._DesktopEnv( - screen_size=(self.config.screen_width, self.config.screen_height), - require_a11y_tree=require_a11y, - a11y_backend=self.config.a11y_backend, - ) - - def _load_all_tasks(self) -> list[BenchmarkTask]: - """Load all WAA tasks from the repository.""" - if self._task_cache: - return list(self._task_cache.values()) - - tasks = [] - - # Load test_all.json metadata - meta_path = self._tasks_path / "test_all.json" - if meta_path.exists(): - with open(meta_path, encoding="utf-8") as f: - meta = json.load(f) - - for domain, task_ids in meta.items(): - if domain in WAA_DOMAINS: - for task_id in task_ids: - task = self._load_task_from_json(domain, task_id) - if task: - tasks.append(task) - self._task_cache[task.task_id] = task - else: - # Fallback: scan examples directory - examples_dir = self._tasks_path / "examples" - if examples_dir.exists(): - for domain_dir in examples_dir.iterdir(): - if domain_dir.is_dir() and domain_dir.name in WAA_DOMAINS: - for task_file in domain_dir.glob("*.json"): - task = self._load_task_from_file(task_file, domain_dir.name) - if task: - tasks.append(task) - self._task_cache[task.task_id] = task - - logger.info(f"Loaded {len(tasks)} WAA tasks") - return tasks - - def _load_task_from_json(self, domain: str, task_id: str) -> BenchmarkTask | None: - """Load a task from its JSON file.""" - task_file = self._tasks_path / "examples" / domain / f"{task_id}.json" - if not task_file.exists(): - logger.warning(f"Task file not found: {task_file}") - return None - - return self._load_task_from_file(task_file, domain) - - def _load_task_from_file( - self, task_file: Path, domain: str - ) -> BenchmarkTask | None: - """Load a task from a JSON file.""" - try: - with open(task_file, encoding="utf-8") as f: - config = json.load(f) - - task_id = f"{domain}_{task_file.stem}" - instruction = config.get("instruction", config.get("task", "")) - - return BenchmarkTask( - task_id=task_id, - instruction=instruction, - domain=domain, - initial_state_ref=config.get("snapshot", None), - time_limit_steps=config.get("max_steps", self.config.max_steps), - raw_config=config, - evaluation_spec=config.get("evaluation", None), - ) - except Exception as e: - logger.warning(f"Failed to load task from {task_file}: {e}") - return None - - def _load_waa_task_config(self, task: BenchmarkTask) -> dict: - """Convert BenchmarkTask to WAA's task config format.""" - return task.raw_config - - def _to_benchmark_observation(self, waa_obs: dict | Any) -> BenchmarkObservation: - """Convert WAA observation to canonical format. 
- - WAA observations may include: - - screenshot: PIL Image or bytes - - a11y_tree: UIA accessibility tree dict - - window_title: Active window title - """ - # Handle different WAA observation formats - if isinstance(waa_obs, dict): - screenshot = waa_obs.get("screenshot") - a11y_tree = waa_obs.get("a11y_tree", waa_obs.get("accessibility_tree")) - window_title = waa_obs.get("window_title") - raw_obs = waa_obs - else: - # WAA may return observation as object with attributes - screenshot = getattr(waa_obs, "screenshot", None) - a11y_tree = getattr(waa_obs, "a11y_tree", None) - window_title = getattr(waa_obs, "window_title", None) - raw_obs = {"waa_obs_type": type(waa_obs).__name__} - - # Convert PIL Image to bytes if needed - screenshot_bytes = None - if screenshot is not None: - if hasattr(screenshot, "tobytes"): - # PIL Image - convert to PNG bytes - import io - - buf = io.BytesIO() - screenshot.save(buf, format="PNG") - screenshot_bytes = buf.getvalue() - elif isinstance(screenshot, bytes): - screenshot_bytes = screenshot - - return BenchmarkObservation( - screenshot=screenshot_bytes, - viewport=(self.config.screen_width, self.config.screen_height), - accessibility_tree=a11y_tree, - window_title=window_title, - raw_observation=raw_obs, - ) - - def _to_waa_action(self, action: BenchmarkAction) -> dict: - """Convert canonical action to WAA format. - - WAA action format: - - click: {"action_type": "click", "coordinate": [x, y]} - - double_click: {"action_type": "double_click", "coordinate": [x, y]} - - type: {"action_type": "type", "text": "..."} - - key: {"action_type": "key", "key": "...", "modifiers": [...]} - - scroll: {"action_type": "scroll", "direction": "...", "amount": ...} - - drag: {"action_type": "drag", "start": [x, y], "end": [x, y]} - """ - action_type = action.type - - # Map canonical action types to WAA format - if action_type == "click": - x = action.x or 0 - y = action.y or 0 - # Convert normalized coords to pixels if needed - if 0 <= x <= 1 and 0 <= y <= 1: - x = int(x * self.config.screen_width) - y = int(y * self.config.screen_height) - return { - "action_type": "click", - "coordinate": [int(x), int(y)], - } - - elif action_type == "double_click": - x = action.x or 0 - y = action.y or 0 - if 0 <= x <= 1 and 0 <= y <= 1: - x = int(x * self.config.screen_width) - y = int(y * self.config.screen_height) - return { - "action_type": "double_click", - "coordinate": [int(x), int(y)], - } - - elif action_type == "right_click": - x = action.x or 0 - y = action.y or 0 - if 0 <= x <= 1 and 0 <= y <= 1: - x = int(x * self.config.screen_width) - y = int(y * self.config.screen_height) - return { - "action_type": "right_click", - "coordinate": [int(x), int(y)], - } - - elif action_type == "type": - return { - "action_type": "type", - "text": action.text or "", - } - - elif action_type == "key": - waa_action = { - "action_type": "key", - "key": action.key or "", - } - if action.modifiers: - waa_action["modifiers"] = action.modifiers - return waa_action - - elif action_type == "scroll": - return { - "action_type": "scroll", - "direction": action.scroll_direction or "down", - "amount": action.scroll_amount or 3, # Default scroll amount - } - - elif action_type == "drag": - x1 = action.x or 0 - y1 = action.y or 0 - x2 = action.end_x or 0 - y2 = action.end_y or 0 - # Convert normalized coords - if 0 <= x1 <= 1: - x1 = int(x1 * self.config.screen_width) - y1 = int(y1 * self.config.screen_height) - if 0 <= x2 <= 1: - x2 = int(x2 * self.config.screen_width) - y2 = int(y2 * 
self.config.screen_height) - return { - "action_type": "drag", - "start": [int(x1), int(y1)], - "end": [int(x2), int(y2)], - } - - elif action_type == "done": - return {"action_type": "done"} - - elif action_type == "wait": - return {"action_type": "wait"} - - else: - logger.warning(f"Unknown action type: {action_type}") - return {"action_type": action_type, "raw": action.raw_action} - - -class WAAMockAdapter(BenchmarkAdapter): - """Mock WAA adapter for testing without Windows VM. - - Useful for: - - Testing the benchmark integration without actual WAA - - Development on non-Windows platforms - - Unit tests - - Args: - num_tasks: Number of mock tasks to generate. - domains: Domains to include in mock tasks. - """ - - def __init__( - self, - num_tasks: int = 20, - domains: list[str] | None = None, - ): - self._num_tasks = num_tasks - self._domains = domains or WAA_DOMAINS[:3] # Default to first 3 domains - self._tasks: list[BenchmarkTask] = [] - self._current_task: BenchmarkTask | None = None - self._step_count = 0 - self._temp_dir: Path | None = None - self._actions: list[BenchmarkAction] = [] # Track actions for evaluation - self._text_entered: str | None = None # Track typed text - self._generate_mock_tasks() - - @property - def name(self) -> str: - return "waa-mock" - - @property - def benchmark_type(self) -> str: - return "interactive" - - def _generate_mock_tasks(self) -> None: - """Generate mock tasks for testing.""" - tasks_per_domain = self._num_tasks // len(self._domains) - extra = self._num_tasks % len(self._domains) - - for i, domain in enumerate(self._domains): - count = tasks_per_domain + (1 if i < extra else 0) - for j in range(count): - task_id = f"{domain}_{j + 1}" - self._tasks.append( - BenchmarkTask( - task_id=task_id, - instruction=f"Mock task {j + 1} in {domain} domain", - domain=domain, - time_limit_steps=15, - raw_config={"mock": True}, - ) - ) - - def list_tasks(self, domain: str | None = None) -> list[BenchmarkTask]: - if domain is not None: - return [t for t in self._tasks if t.domain == domain] - return self._tasks - - def load_task(self, task_id: str) -> BenchmarkTask: - for task in self._tasks: - if task.task_id == task_id: - return task - raise KeyError(f"Task '{task_id}' not found") - - def reset(self, task: BenchmarkTask) -> BenchmarkObservation: - self._current_task = task - self._step_count = 0 - self._actions = [] # Clear action history - self._text_entered = None - return self._mock_observation() - - def step( - self, action: BenchmarkAction - ) -> tuple[BenchmarkObservation, bool, dict[str, Any]]: - self._step_count += 1 - self._actions.append(action) # Track action for evaluation - - # Track typed text - if action.type == "type" and action.text: - self._text_entered = action.text - - done = action.type == "done" or self._step_count >= 15 - return self._mock_observation(), done, {"step": self._step_count} - - def evaluate(self, task: BenchmarkTask) -> BenchmarkResult: - """Evaluate task based on actions taken. - - Success criteria for mock tasks: - - Agent clicked the Submit button (ID 4) OR - - Agent typed text AND clicked OK (ID 1) OR - - Agent completed with DONE action after meaningful interaction - - This provides deterministic evaluation based on actual agent behavior, - not random chance. 
The mock UI has: - - ID 1: OK button - - ID 2: Text input field - - ID 3: Cancel button - - ID 4: Submit button - """ - # Check what actions were taken - clicked_ids = set() - typed_text = False - called_done = False - - for action in self._actions: - if action.type == "click": - # Extract target node ID from action - target_id = getattr(action, "target_node_id", None) - if target_id: - clicked_ids.add(str(target_id)) - elif action.type == "type" and action.text: - typed_text = True - elif action.type == "done": - called_done = True - - # Success criteria: - # 1. Clicked Submit (ID 4) - primary success path - # 2. Typed something AND clicked OK (ID 1) - form submission path - # 3. Called DONE after at least 2 actions - reasonable completion - clicked_submit = "4" in clicked_ids - clicked_ok = "1" in clicked_ids - form_submitted = typed_text and clicked_ok - reasonable_completion = called_done and len(self._actions) >= 2 - - success = clicked_submit or form_submitted or reasonable_completion - - # Calculate partial credit score - score = 0.0 - if success: - score = 1.0 - elif typed_text or clicked_ids: - # Partial credit for taking meaningful actions - score = ( - 0.3 + (0.1 * min(len(clicked_ids), 3)) + (0.2 if typed_text else 0.0) - ) - - return BenchmarkResult( - task_id=task.task_id, - success=success, - score=score, - num_steps=self._step_count, - reason=f"clicked={list(clicked_ids)}, typed={typed_text}, done={called_done}", - ) - - def _mock_observation(self) -> BenchmarkObservation: - """Generate a mock observation with a real screenshot file.""" - import tempfile - - # Create temp directory if needed - if self._temp_dir is None: - self._temp_dir = Path(tempfile.mkdtemp(prefix="waa_mock_")) - - # Generate a simple mock screenshot (gray image with text) - screenshot_path = self._temp_dir / f"mock_step_{self._step_count}.png" - self._generate_mock_screenshot(screenshot_path) - - return BenchmarkObservation( - screenshot=screenshot_path.read_bytes(), - screenshot_path=str(screenshot_path), - viewport=(1920, 1200), - accessibility_tree={ - "role": "window", - "name": "Mock Window", - "children": [ - {"role": "button", "name": "OK", "id": "1"}, - {"role": "textfield", "name": "Input", "id": "2"}, - {"role": "button", "name": "Cancel", "id": "3"}, - {"role": "button", "name": "Submit", "id": "4"}, - ], - }, - window_title="Mock Window - Testing", - ) - - def _generate_mock_screenshot(self, path: Path) -> None: - """Generate a simple mock screenshot image.""" - try: - from PIL import Image, ImageDraw - - # Create a simple gray image with some UI elements - img = Image.new("RGB", (1920, 1200), color=(240, 240, 240)) - draw = ImageDraw.Draw(img) - - # Draw a title bar - draw.rectangle([0, 0, 1920, 40], fill=(60, 60, 60)) - draw.text((20, 10), "Mock Application Window", fill=(255, 255, 255)) - - # Draw some buttons - draw.rectangle([100, 100, 200, 140], fill=(0, 120, 215)) - draw.text((120, 110), "OK", fill=(255, 255, 255)) - - draw.rectangle([220, 100, 320, 140], fill=(200, 200, 200)) - draw.text((240, 110), "Cancel", fill=(0, 0, 0)) - - # Draw a text field - draw.rectangle([100, 160, 500, 200], outline=(100, 100, 100)) - draw.text((110, 170), "Enter text here...", fill=(150, 150, 150)) - - # Draw task instruction - task_name = self._current_task.task_id if self._current_task else "Unknown" - draw.text((100, 250), f"Task: {task_name}", fill=(0, 0, 0)) - draw.text((100, 280), f"Step: {self._step_count}", fill=(0, 0, 0)) - - img.save(path) - except ImportError: - # Fallback: create a minimal 
valid PNG if PIL not available - # This is a 1x1 gray PNG - minimal_png = bytes( - [ - 0x89, - 0x50, - 0x4E, - 0x47, - 0x0D, - 0x0A, - 0x1A, - 0x0A, # PNG signature - 0x00, - 0x00, - 0x00, - 0x0D, - 0x49, - 0x48, - 0x44, - 0x52, # IHDR chunk - 0x00, - 0x00, - 0x00, - 0x01, - 0x00, - 0x00, - 0x00, - 0x01, - 0x08, - 0x02, - 0x00, - 0x00, - 0x00, - 0x90, - 0x77, - 0x53, - 0xDE, - 0x00, - 0x00, - 0x00, - 0x0C, - 0x49, - 0x44, - 0x41, # IDAT chunk - 0x54, - 0x08, - 0xD7, - 0x63, - 0xF8, - 0xCF, - 0xC0, - 0x00, - 0x00, - 0x00, - 0x03, - 0x00, - 0x01, - 0x00, - 0x05, - 0xFE, - 0xD4, - 0xEF, - 0x00, - 0x00, - 0x00, - 0x00, - 0x49, - 0x45, # IEND chunk - 0x4E, - 0x44, - 0xAE, - 0x42, - 0x60, - 0x82, - ] - ) - path.write_bytes(minimal_png) diff --git a/openadapt_ml/benchmarks/waa_deploy/Dockerfile b/openadapt_ml/benchmarks/waa_deploy/Dockerfile index 02d0817..b5078f8 100644 --- a/openadapt_ml/benchmarks/waa_deploy/Dockerfile +++ b/openadapt_ml/benchmarks/waa_deploy/Dockerfile @@ -83,16 +83,15 @@ RUN find /client -name "*.py" -exec sed -i 's|20.20.20.21|172.30.0.2|g' {} \; && echo "Patched client Python files" # ----------------------------------------------------------------------------- -# Add API-backed agent support (Claude Sonnet 4.5 / GPT-5.1) -# This allows using --agent api-claude or --agent api-openai instead of navi +# Add API-backed agent support (Claude / OpenAI) +# NOTE: API agents (api-claude, api-openai) are run EXTERNALLY via openadapt-evals CLI +# which connects to the WAA server over SSH tunnel. No internal patching needed. +# The api_agent.py is included for reference/future use. # ----------------------------------------------------------------------------- -# Copy api_agent.py to the client mm_agents directory +# Copy api_agent.py for reference (used externally by openadapt-evals) COPY api_agent.py /client/mm_agents/api_agent.py -# Note: API agent patching (api-claude, api-openai) skipped for now -# The navi agent works out of the box - API agents can be added later - # ----------------------------------------------------------------------------- # Fix Windows setup for automation # ----------------------------------------------------------------------------- diff --git a/openadapt_ml/benchmarks/waa_live.py b/openadapt_ml/benchmarks/waa_live.py deleted file mode 100644 index 4dfd0d4..0000000 --- a/openadapt_ml/benchmarks/waa_live.py +++ /dev/null @@ -1,623 +0,0 @@ -"""Windows Agent Arena Live adapter. - -This module provides a live HTTP-based adapter for WAA that connects to the -WAA Flask server running inside a Windows VM. Unlike WAAAdapter which imports -WAA's DesktopEnv locally, this adapter talks to the server remotely. - -Architecture: - The adapter uses WAA's element-based execution model: - 1. Fetch accessibility tree from /accessibility endpoint - 2. Extract element bboxes and POST to /update_computer as rects dict - 3. Agent outputs actions with target_node_id (element-based grounding) - 4. Execute via /execute_windows using computer.mouse.move_id(id) commands - - This keeps grounding authority on WAA side - we send element IDs, - not pixel coordinates. WAA's Computer class handles the grounding. 
- -Example: - from openadapt_ml.benchmarks.waa_live import WAALiveAdapter, WAALiveConfig - - adapter = WAALiveAdapter(WAALiveConfig(server_url="http://vm-ip:5000")) - agent = DemoConditionedAgent(base_agent, retriever) - results = evaluate_agent_on_benchmark(agent, adapter, max_steps=15) -""" - -from __future__ import annotations - -import base64 -import logging -import time -from dataclasses import dataclass -from typing import Any - -import requests - -from openadapt_ml.benchmarks.base import ( - BenchmarkAction, - BenchmarkAdapter, - BenchmarkObservation, - BenchmarkResult, - BenchmarkTask, -) - -logger = logging.getLogger(__name__) - - -@dataclass -class WAALiveConfig: - """Configuration for WAALiveAdapter. - - Attributes: - server_url: URL of WAA Flask server (e.g., "http://172.171.112.41:5000"). - a11y_backend: Accessibility backend ("uia" or "win32"). - screen_width: Screen width in pixels. - screen_height: Screen height in pixels. - max_steps: Default maximum steps per task. - action_delay: Delay after actions in seconds (for UI to settle). - timeout: Request timeout in seconds. - """ - - server_url: str = "http://localhost:5000" - a11y_backend: str = "uia" - screen_width: int = 1920 - screen_height: int = 1200 - max_steps: int = 15 - action_delay: float = 0.5 - timeout: float = 90.0 - - -class WAALiveAdapter(BenchmarkAdapter): - """Live WAA adapter that connects to WAA Flask server over HTTP. - - Unlike WAAAdapter which imports WAA's DesktopEnv locally, this adapter - talks to the WAA server remotely via HTTP. This enables: - - Running DemoConditionedAgent from local machine - - Using our own VLM (Claude/GPT) instead of WAA's built-in navi agent - - Injecting demos into prompts before each action - - Args: - config: WAALiveConfig with server URL and settings. - """ - - def __init__(self, config: WAALiveConfig | None = None): - self.config = config or WAALiveConfig() - self._current_task: BenchmarkTask | None = None - self._step_count = 0 - self._current_a11y: dict | None = None - self._current_rects: dict[str, list[int]] = {} # element_id -> [l, t, r, b] - self._current_screenshot: bytes | None = None - self._actions: list[BenchmarkAction] = [] - - @property - def name(self) -> str: - """Benchmark name.""" - return "waa-live" - - @property - def benchmark_type(self) -> str: - """Benchmark type (interactive).""" - return "interactive" - - @property - def supports_parallel(self) -> bool: - """Whether parallel execution is supported.""" - return False # Single VM for now - - def check_connection(self) -> bool: - """Check if WAA server is reachable. - - Returns: - True if server responds to /probe endpoint. - """ - try: - resp = requests.get(f"{self.config.server_url}/probe", timeout=5.0) - return resp.status_code == 200 - except requests.RequestException: - return False - - def list_tasks(self, domain: str | None = None) -> list[BenchmarkTask]: - """List available WAA tasks. - - For live adapter, tasks are typically loaded on-demand. - Returns empty list - use load_task() directly. - """ - return [] - - def load_task(self, task_id: str) -> BenchmarkTask: - """Load a specific task by ID. - - Args: - task_id: Task identifier. - - Returns: - BenchmarkTask object. 
- """ - # For now, create a minimal task - actual task configs should be - # loaded from WAA repo if needed - return BenchmarkTask( - task_id=task_id, - instruction=f"Task {task_id}", - domain=task_id.split("_")[0] if "_" in task_id else "unknown", - time_limit_steps=self.config.max_steps, - ) - - def reset(self, task: BenchmarkTask) -> BenchmarkObservation: - """Reset environment to task's initial state. - - Args: - task: Task to initialize. - - Returns: - Initial observation (screenshot + accessibility tree). - - Raises: - RuntimeError: If server is not reachable. - """ - if not self.check_connection(): - raise RuntimeError( - f"Cannot connect to WAA server at {self.config.server_url}. " - f"Ensure Windows VM is running and server is started." - ) - - self._current_task = task - self._step_count = 0 - self._actions = [] - - # Try to close all windows for clean state - try: - requests.post(f"{self.config.server_url}/setup/close_all", timeout=30.0) - logger.info("Closed all windows for clean state") - except requests.RequestException as e: - logger.warning(f"Failed to close windows: {e}") - - # If task has setup commands in raw_config, execute them - if task.raw_config: - self._run_task_setup(task.raw_config) - - # Small delay for UI to settle - time.sleep(1.0) - - return self._get_observation() - - def step( - self, action: BenchmarkAction - ) -> tuple[BenchmarkObservation, bool, dict[str, Any]]: - """Execute action and return new observation. - - Uses element-based grounding via WAA's Computer class. Click actions - are translated to computer.mouse.move_id(id) commands that WAA executes - using the rects we POSTed to /update_computer. - - Args: - action: Action to execute. - - Returns: - Tuple of (observation, done, info). - """ - self._step_count += 1 - self._actions.append(action) - - # Translate action to element-based command for WAA's Computer - command = self._translate_action(action) - - # Execute command via /execute_windows (has access to computer object) - if command: - try: - resp = requests.post( - f"{self.config.server_url}/execute_windows", - json={"command": command}, - timeout=self.config.timeout, - ) - if resp.status_code != 200: - logger.error(f"Execute failed ({resp.status_code}): {resp.text}") - else: - result = resp.json() - if result.get("stderr"): - logger.warning(f"Command stderr: {result['stderr']}") - logger.debug(f"Executed: {command}") - except requests.RequestException as e: - logger.error(f"Execute request failed: {e}") - - # Wait for UI to settle - time.sleep(self.config.action_delay) - - # Check if done - done = action.type == "done" or self._step_count >= self.config.max_steps - - obs = self._get_observation() - info = { - "step": self._step_count, - "command": command, - } - - return obs, done, info - - def evaluate(self, task: BenchmarkTask) -> BenchmarkResult: - """Evaluate current state against task success criteria. - - For live adapter, full evaluation requires running WAA's evaluators. - Currently returns a placeholder result. - - Args: - task: Task to evaluate. - - Returns: - BenchmarkResult with success/score. 
- """ - # TODO: Implement proper evaluation by calling WAA evaluators - # For now, check if agent took any actions - has_actions = len(self._actions) > 0 - called_done = any(a.type == "done" for a in self._actions) - - return BenchmarkResult( - task_id=task.task_id, - success=False, # Can't determine without evaluator - score=0.5 if has_actions and called_done else 0.0, - num_steps=self._step_count, - reason="Evaluation requires WAA evaluators (not yet implemented)", - ) - - def close(self) -> None: - """Clean up resources.""" - self._current_task = None - self._current_a11y = None - self._actions = [] - - def _get_observation(self) -> BenchmarkObservation: - """Fetch current observation from WAA server. - - Also extracts element rects from a11y tree and updates WAA's Computer - so element-based grounding works for subsequent actions. - - Returns: - BenchmarkObservation with screenshot and accessibility tree. - """ - screenshot = None - a11y_tree = None - - # Get screenshot - try: - resp = requests.get(f"{self.config.server_url}/screenshot", timeout=30.0) - if resp.status_code == 200: - screenshot = resp.content - self._current_screenshot = screenshot - logger.debug(f"Got screenshot: {len(screenshot)} bytes") - else: - logger.warning(f"Screenshot request failed: {resp.status_code}") - except requests.RequestException as e: - logger.error(f"Screenshot request error: {e}") - - # Get accessibility tree - try: - resp = requests.get( - f"{self.config.server_url}/accessibility", - params={"backend": self.config.a11y_backend}, - timeout=30.0, - ) - if resp.status_code == 200: - result = resp.json() - a11y_tree = result.get("AT", {}) - self._current_a11y = a11y_tree - # Extract rects for element-based grounding - self._current_rects = self._extract_rects_from_a11y(a11y_tree) - logger.debug( - "Got accessibility tree with %d elements", len(self._current_rects) - ) - else: - logger.warning(f"A11y request failed: {resp.status_code}") - except requests.RequestException as e: - logger.error(f"A11y request error: {e}") - - # Update WAA's Computer with current rects for element grounding - if self._current_rects: - self._update_waa_computer() - - return BenchmarkObservation( - screenshot=screenshot, - viewport=(self.config.screen_width, self.config.screen_height), - accessibility_tree=a11y_tree, - window_title=self._extract_window_title(a11y_tree), - ) - - def _extract_window_title(self, a11y_tree: dict | str | None) -> str | None: - """Extract window title from accessibility tree.""" - if not a11y_tree: - return None - # Handle XML string - can't extract title easily - if isinstance(a11y_tree, str): - return None - # Try common field names - for key in ["Name", "name", "title", "Title"]: - if key in a11y_tree: - return a11y_tree[key] - return None - - def _extract_rects_from_a11y(self, a11y_tree: dict | None) -> dict[str, list[int]]: - """Extract element ID -> bounding box mapping from accessibility tree. - - This produces the `rects` dict that WAA's Computer class expects. - The rects are then POSTed to /update_computer so WAA can handle grounding. - - Args: - a11y_tree: Accessibility tree from /accessibility endpoint. - - Returns: - Dict mapping element IDs to [left, top, right, bottom] bounding boxes. 
- """ - rects: dict[str, list[int]] = {} - - def visit(node: dict) -> None: - # Get element ID - elem_id = None - for id_field in ["id", "Id", "ID", "AutomationId"]: - if id_field in node and node[id_field]: - elem_id = str(node[id_field]) - break - - # Get bounding box - bbox = None - for bbox_field in ["bbox", "BoundingRectangle", "Rect", "rect"]: - if bbox_field in node: - bbox = node[bbox_field] - break - - # Store if we have both ID and bbox - if elem_id is not None and bbox is not None: - # Normalize bbox to [left, top, right, bottom] - if isinstance(bbox, list) and len(bbox) == 4: - # Could be [l, t, r, b] or [l, t, w, h] - assume [l, t, r, b] - rects[elem_id] = [int(x) for x in bbox] - elif isinstance(bbox, dict): - x = bbox.get("x", 0) - y = bbox.get("y", 0) - w = bbox.get("width", 0) - h = bbox.get("height", 0) - rects[elem_id] = [x, y, x + w, y + h] - elif isinstance(bbox, str): - parts = [int(p) for p in bbox.split(",")] - if len(parts) == 4: - rects[elem_id] = parts - - # Visit children - for child_field in ["children", "Children"]: - children = node.get(child_field, []) - if isinstance(children, list): - for child in children: - if isinstance(child, dict): - visit(child) - - if a11y_tree: - # Handle case where a11y_tree is XML string (WAA returns XML) - if isinstance(a11y_tree, str): - # TODO: Parse XML to dict if needed for element grounding - logger.debug("A11y tree is XML string, skipping rect extraction") - return rects - visit(a11y_tree) - - logger.debug(f"Extracted {len(rects)} element rects from a11y tree") - return rects - - def _update_waa_computer(self) -> None: - """POST current rects and screenshot to WAA's /update_computer endpoint. - - This syncs WAA's Computer object with our current element state, - allowing computer.mouse.move_id(id) to work correctly. - """ - if not self._current_rects: - logger.warning("No rects to update - skipping /update_computer") - return - - # Encode screenshot as base64 - screenshot_b64 = "" - if self._current_screenshot: - screenshot_b64 = base64.b64encode(self._current_screenshot).decode("utf-8") - - # Window rect (full screen for now) - window_rect = [0, 0, self.config.screen_width, self.config.screen_height] - - payload = { - "rects": self._current_rects, - "window_rect": window_rect, - "screenshot": screenshot_b64, - "scale": [1.0, 1.0], - } - - try: - resp = requests.post( - f"{self.config.server_url}/update_computer", json=payload, timeout=30.0 - ) - if resp.status_code == 200: - logger.debug( - "Updated WAA computer with %d rects", len(self._current_rects) - ) - else: - logger.warning( - f"update_computer failed: {resp.status_code} - {resp.text}" - ) - except requests.RequestException as e: - logger.error(f"update_computer request error: {e}") - - def _run_task_setup(self, raw_config: dict) -> None: - """Run task setup commands from raw_config. - - Args: - raw_config: Task configuration with setup commands. 
- """ - # Handle different setup command formats - setup = raw_config.get("setup", raw_config.get("init", {})) - - if isinstance(setup, dict): - # Launch application if specified - if "app" in setup or "application" in setup: - app = setup.get("app") or setup.get("application") - try: - requests.post( - f"{self.config.server_url}/setup/launch", - json={"app": app}, - timeout=30.0, - ) - logger.info(f"Launched app: {app}") - except requests.RequestException as e: - logger.warning(f"Failed to launch app: {e}") - - # Run shell commands if specified - if "commands" in setup: - for cmd in setup["commands"]: - try: - requests.post( - f"{self.config.server_url}/execute_windows", - json={"command": cmd, "shell": "powershell"}, - timeout=60.0, - ) - logger.info(f"Ran setup command: {cmd[:50]}...") - except requests.RequestException as e: - logger.warning(f"Setup command failed: {e}") - - def _translate_action(self, action: BenchmarkAction) -> str | None: - """Translate BenchmarkAction to element-based command for WAA's Computer. - - Uses WAA's Computer class via /execute_windows endpoint. Click actions - use computer.mouse.move_id(id) for element-based grounding - the actual - coordinates are resolved by WAA's Computer class using the rects we - POSTed to /update_computer. - - Args: - action: The action to translate. - - Returns: - Python command string to execute via /execute_windows endpoint, - or None for actions that don't need execution. - """ - if action.type == "done": - return None - - if action.type == "wait": - return "import time; time.sleep(1)" - - if action.type == "click": - return self._translate_click_action(action, "single_click") - - if action.type == "double_click": - return self._translate_click_action(action, "double_click") - - if action.type == "right_click": - return self._translate_click_action(action, "right_click") - - if action.type == "type": - text = action.text or "" - # Escape special characters - text = text.replace("\\", "\\\\").replace("'", "\\'") - # Use pyautogui for typing (no grounding needed) - return f"import pyautogui; pyautogui.write('{text}', interval=0.02)" - - if action.type == "key": - return self._translate_key_action(action) - - if action.type == "scroll": - direction = action.scroll_direction or "down" - return f"computer.mouse.scroll('{direction}')" - - if action.type == "drag": - # Drag requires start and end - use element IDs or coordinates - if action.target_node_id is not None: - elem_id = str(action.target_node_id) - if elem_id in self._current_rects: - # Start at element, drag to end coords - end_x = action.end_x or 0 - end_y = action.end_y or 0 - if isinstance(end_x, float) and 0 <= end_x <= 1: - end_x = int(end_x * self.config.screen_width) - if isinstance(end_y, float) and 0 <= end_y <= 1: - end_y = int(end_y * self.config.screen_height) - return ( - f"computer.mouse.move_id('{elem_id}'); " - f"computer.mouse.drag({int(end_x)}, {int(end_y)})" - ) - logger.warning("Drag requires target_node_id with valid element") - return None - - logger.warning(f"Unknown action type: {action.type}") - return None - - def _translate_click_action( - self, action: BenchmarkAction, click_method: str - ) -> str: - """Translate click-type action to element-based command. - - Args: - action: The click action. - click_method: "single_click", "double_click", or "right_click". - - Returns: - Python command string using computer.mouse.move_id() for grounding. 
- """ - # Prefer element ID for grounding (SoM mode) - if action.target_node_id is not None: - elem_id = str(action.target_node_id) - if elem_id in self._current_rects: - return f"computer.mouse.move_id('{elem_id}'); computer.mouse.{click_method}()" - else: - logger.warning( - f"Element ID '{elem_id}' not found in rects, falling back to coordinates" - ) - - # Fallback: use coordinates if provided (less precise) - x = action.x if action.x is not None else 0 - y = action.y if action.y is not None else 0 - - # Normalize coordinates - if isinstance(x, float) and 0 <= x <= 1: - x = x # Keep normalized - move_abs handles it - if isinstance(y, float) and 0 <= y <= 1: - y = y # Keep normalized - - return f"computer.mouse.move_abs({x}, {y}); computer.mouse.{click_method}()" - - def _translate_key_action(self, action: BenchmarkAction) -> str: - """Translate key press action using pyautogui (no grounding needed).""" - key = action.key or "" - - # Map common key names to pyautogui names - key_map = { - "Enter": "enter", - "Return": "enter", - "Tab": "tab", - "Escape": "escape", - "Esc": "escape", - "Backspace": "backspace", - "Delete": "delete", - "Del": "delete", - "Space": "space", - "Up": "up", - "Down": "down", - "Left": "left", - "Right": "right", - "Home": "home", - "End": "end", - "PageUp": "pageup", - "PageDown": "pagedown", - "F1": "f1", - "F2": "f2", - "F3": "f3", - "F4": "f4", - "F5": "f5", - "F6": "f6", - "F7": "f7", - "F8": "f8", - "F9": "f9", - "F10": "f10", - "F11": "f11", - "F12": "f12", - } - key = key_map.get(key, key.lower()) - - # Handle modifiers with hotkey - if action.modifiers: - mods = [m.lower() for m in action.modifiers] - mod_map = {"control": "ctrl", "command": "win", "meta": "win"} - mods = [mod_map.get(m, m) for m in mods] - all_keys = mods + [key] - keys_str = ", ".join(f"'{k}'" for k in all_keys) - return f"import pyautogui; pyautogui.hotkey({keys_str})" - - return f"import pyautogui; pyautogui.press('{key}')" diff --git a/openadapt_ml/experiments/waa_demo/runner.py b/openadapt_ml/experiments/waa_demo/runner.py index 77fa630..73c826d 100644 --- a/openadapt_ml/experiments/waa_demo/runner.py +++ b/openadapt_ml/experiments/waa_demo/runner.py @@ -38,7 +38,7 @@ ) if TYPE_CHECKING: - from openadapt_ml.benchmarks.base import ( + from openadapt_evals import ( BenchmarkAction, BenchmarkObservation, BenchmarkTask, @@ -267,7 +267,7 @@ def act( Returns: BenchmarkAction parsed from VLM response """ - from openadapt_ml.benchmarks.base import BenchmarkAction + from openadapt_evals import BenchmarkAction adapter = self._get_adapter() @@ -409,7 +409,7 @@ def _parse_response( Uses the same parsing logic as APIBenchmarkAgent. """ import re - from openadapt_ml.benchmarks.base import BenchmarkAction + from openadapt_evals import BenchmarkAction raw_action = {"response": response} @@ -512,12 +512,10 @@ def cmd_run(args: argparse.Namespace) -> int: This integrates with the benchmarks infrastructure to run either zero-shot or demo-conditioned evaluation on WAA tasks. 
""" - from openadapt_ml.benchmarks import ( + from openadapt_evals import ( + EvaluationConfig, WAAMockAdapter, compute_metrics, - ) - from openadapt_ml.benchmarks.runner import ( - EvaluationConfig, evaluate_agent_on_benchmark, ) diff --git a/pyproject.toml b/pyproject.toml index a26039d..4910c68 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -68,6 +68,10 @@ dev = [ "pytest>=9.0.0", "ruff>=0.1.0", ] +# Benchmark evaluation (depends on openadapt-evals) +benchmarks = [ + "openadapt-evals>=0.1.1", +] [project.urls] Homepage = "https://github.com/OpenAdaptAI/openadapt-ml" diff --git a/tests/benchmarks/test_api_agent.py b/tests/benchmarks/test_api_agent.py index d7732de..211d7bf 100644 --- a/tests/benchmarks/test_api_agent.py +++ b/tests/benchmarks/test_api_agent.py @@ -2,8 +2,8 @@ import pytest -from openadapt_ml.benchmarks import APIBenchmarkAgent, BenchmarkAction -from openadapt_ml.benchmarks.base import BenchmarkObservation, BenchmarkTask +from openadapt_evals import BenchmarkAction, BenchmarkObservation, BenchmarkTask +from openadapt_ml.benchmarks import APIBenchmarkAgent class TestAPIBenchmarkAgentParsing: diff --git a/tests/benchmarks/test_waa.py b/tests/benchmarks/test_waa.py index cc1418c..9dc4cf9 100644 --- a/tests/benchmarks/test_waa.py +++ b/tests/benchmarks/test_waa.py @@ -2,7 +2,7 @@ import pytest -from openadapt_ml.benchmarks import ( +from openadapt_evals import ( BenchmarkAction, BenchmarkObservation, BenchmarkResult, @@ -35,8 +35,9 @@ def test_list_tasks_by_domain(self): def test_load_task(self): """Test loading a specific task.""" adapter = WAAMockAdapter(num_tasks=5, domains=["browser"]) - task = adapter.load_task("browser_1") - assert task.task_id == "browser_1" + # Mock adapter uses "mock_{domain}_{number:03d}" format + task = adapter.load_task("mock_browser_001") + assert task.task_id == "mock_browser_001" assert task.domain == "browser" def test_load_task_not_found(self): @@ -124,16 +125,17 @@ def test_evaluate_with_random_agent(self): def test_evaluate_specific_tasks(self): """Test evaluating specific tasks.""" - adapter = WAAMockAdapter(num_tasks=10) + adapter = WAAMockAdapter(num_tasks=10, domains=["browser", "notepad"]) agent = RandomAgent(seed=42) + # Mock adapter uses "mock_{domain}_{number:03d}" format results = evaluate_agent_on_benchmark( - agent, adapter, task_ids=["browser_1", "browser_2"], max_steps=10 + agent, adapter, task_ids=["mock_browser_001", "mock_browser_002"], max_steps=10 ) assert len(results) == 2 - assert results[0].task_id == "browser_1" - assert results[1].task_id == "browser_2" + assert results[0].task_id == "mock_browser_001" + assert results[1].task_id == "mock_browser_002" def test_evaluate_with_scripted_agent(self): """Test running evaluation with ScriptedAgent.""" diff --git a/tests/integration/test_data_collection.py b/tests/integration/test_data_collection.py index abd2412..159e1f2 100644 --- a/tests/integration/test_data_collection.py +++ b/tests/integration/test_data_collection.py @@ -30,9 +30,13 @@ import logging from pathlib import Path -from openadapt_ml.benchmarks.agent import RandomAgent -from openadapt_ml.benchmarks.runner import EvaluationConfig, evaluate_agent_on_benchmark -from openadapt_ml.benchmarks.waa import WAAMockAdapter +# Import from openadapt-evals (canonical benchmark package) +from openadapt_evals import ( + EvaluationConfig, + RandomAgent, + WAAMockAdapter, + evaluate_agent_on_benchmark, +) # Set up logging logging.basicConfig( diff --git a/tests/integration/test_live_eval.py 
b/tests/integration/test_live_eval.py index 448fc7a..68ac704 100644 --- a/tests/integration/test_live_eval.py +++ b/tests/integration/test_live_eval.py @@ -17,8 +17,12 @@ import time from pathlib import Path -from openadapt_ml.benchmarks import RandomAgent, WAAMockAdapter, evaluate_agent_on_benchmark -from openadapt_ml.benchmarks.runner import EvaluationConfig +from openadapt_evals import ( + EvaluationConfig, + RandomAgent, + WAAMockAdapter, + evaluate_agent_on_benchmark, +) def main(): diff --git a/tests/test_waa_live.py b/tests/test_waa_live.py deleted file mode 100644 index 73786c7..0000000 --- a/tests/test_waa_live.py +++ /dev/null @@ -1,314 +0,0 @@ -"""Tests for WAALiveAdapter.""" - -import pytest -from unittest.mock import Mock, patch, MagicMock -import json - -from openadapt_ml.benchmarks.waa_live import WAALiveAdapter, WAALiveConfig -from openadapt_ml.benchmarks.base import BenchmarkAction, BenchmarkTask - - -class TestWAALiveConfig: - """Tests for WAALiveConfig.""" - - def test_default_config(self): - """Test default configuration values.""" - config = WAALiveConfig() - assert config.server_url == "http://localhost:5000" - assert config.a11y_backend == "uia" - assert config.screen_width == 1920 - assert config.screen_height == 1200 - assert config.max_steps == 15 - assert config.action_delay == 0.5 - assert config.timeout == 90.0 - - def test_custom_config(self): - """Test custom configuration.""" - config = WAALiveConfig( - server_url="http://192.168.1.100:5000", - max_steps=20, - action_delay=1.0, - ) - assert config.server_url == "http://192.168.1.100:5000" - assert config.max_steps == 20 - assert config.action_delay == 1.0 - - -class TestWAALiveAdapter: - """Tests for WAALiveAdapter.""" - - def test_adapter_properties(self): - """Test adapter properties.""" - adapter = WAALiveAdapter() - assert adapter.name == "waa-live" - assert adapter.benchmark_type == "interactive" - assert adapter.supports_parallel is False - - @patch("openadapt_ml.benchmarks.waa_live.requests") - def test_check_connection_success(self, mock_requests): - """Test successful connection check.""" - mock_requests.get.return_value = Mock(status_code=200) - - adapter = WAALiveAdapter() - assert adapter.check_connection() is True - - mock_requests.get.assert_called_once() - - @patch("openadapt_ml.benchmarks.waa_live.requests.get") - def test_check_connection_failure(self, mock_get): - """Test failed connection check.""" - import requests - mock_get.side_effect = requests.RequestException("Connection refused") - - adapter = WAALiveAdapter() - assert adapter.check_connection() is False - - -class TestActionTranslation: - """Tests for action translation. 
- - The adapter uses element-based grounding via WAA's Computer class: - - Click actions use computer.mouse.move_id(id) for element grounding - - Keyboard actions use pyautogui (no grounding needed) - - Scroll uses computer.mouse.scroll() - """ - - def test_click_with_element_id(self): - """Test click with element ID uses move_id for grounding.""" - adapter = WAALiveAdapter() - adapter._current_rects = {"5": [100, 200, 300, 400]} - - action = BenchmarkAction(type="click", target_node_id="5") - command = adapter._translate_action(action) - - assert "computer.mouse.move_id('5')" in command - assert "computer.mouse.single_click()" in command - - def test_click_fallback_to_coords(self): - """Test click falls back to move_abs when no element ID.""" - adapter = WAALiveAdapter() - adapter._current_rects = {} - - action = BenchmarkAction(type="click", x=500, y=300) - command = adapter._translate_action(action) - - assert "computer.mouse.move_abs(500, 300)" in command - assert "computer.mouse.single_click()" in command - - def test_click_normalized_coords_fallback(self): - """Test click with normalized coordinates falls back to move_abs.""" - adapter = WAALiveAdapter(WAALiveConfig(screen_width=1920, screen_height=1080)) - adapter._current_rects = {} - - action = BenchmarkAction(type="click", x=0.5, y=0.5) - command = adapter._translate_action(action) - - # Normalized coords passed to move_abs (WAA handles conversion) - assert "computer.mouse.move_abs(0.5, 0.5)" in command - assert "computer.mouse.single_click()" in command - - def test_double_click_with_element_id(self): - """Test double click with element ID.""" - adapter = WAALiveAdapter() - adapter._current_rects = {"7": [0, 0, 100, 50]} - - action = BenchmarkAction(type="double_click", target_node_id="7") - command = adapter._translate_action(action) - - assert "computer.mouse.move_id('7')" in command - assert "computer.mouse.double_click()" in command - - def test_type_action(self): - """Test type action uses pyautogui (no grounding needed).""" - adapter = WAALiveAdapter() - - action = BenchmarkAction(type="type", text="Hello World") - command = adapter._translate_action(action) - - assert "pyautogui.write('Hello World'" in command - - def test_type_action_with_quotes(self): - """Test type action with quotes escaped.""" - adapter = WAALiveAdapter() - - action = BenchmarkAction(type="type", text="It's a \"test\"") - command = adapter._translate_action(action) - - # Should escape single quotes - assert "\\'" in command - - def test_key_action(self): - """Test key action uses pyautogui (no grounding needed).""" - adapter = WAALiveAdapter() - - action = BenchmarkAction(type="key", key="Enter") - command = adapter._translate_action(action) - - assert "pyautogui.press('enter')" in command - - def test_key_action_with_modifiers(self): - """Test key action with modifiers.""" - adapter = WAALiveAdapter() - - action = BenchmarkAction(type="key", key="c", modifiers=["Control"]) - command = adapter._translate_action(action) - - assert "pyautogui.hotkey('ctrl', 'c')" in command - - def test_scroll_action_down(self): - """Test scroll down uses computer.mouse.scroll.""" - adapter = WAALiveAdapter() - - action = BenchmarkAction(type="scroll", scroll_direction="down", scroll_amount=5) - command = adapter._translate_action(action) - - assert "computer.mouse.scroll('down')" in command - - def test_scroll_action_up(self): - """Test scroll up uses computer.mouse.scroll.""" - adapter = WAALiveAdapter() - - action = BenchmarkAction(type="scroll", 
scroll_direction="up", scroll_amount=3) - command = adapter._translate_action(action) - - assert "computer.mouse.scroll('up')" in command - - def test_done_action(self): - """Test done action returns None.""" - adapter = WAALiveAdapter() - - action = BenchmarkAction(type="done") - command = adapter._translate_action(action) - - assert command is None - - def test_wait_action(self): - """Test wait action.""" - adapter = WAALiveAdapter() - - action = BenchmarkAction(type="wait") - command = adapter._translate_action(action) - - assert "time.sleep(1)" in command - - -class TestRectExtraction: - """Tests for extracting element rects from a11y tree. - - The adapter extracts element IDs and bboxes from the a11y tree - and sends them to WAA via /update_computer. WAA then handles - the actual grounding when computer.mouse.move_id(id) is called. - """ - - def test_extract_rects_simple(self): - """Test extracting rects from simple a11y tree.""" - adapter = WAALiveAdapter() - a11y_tree = { - "id": "root", - "children": [ - { - "id": "5", - "bbox": [100, 200, 300, 400], - } - ] - } - - rects = adapter._extract_rects_from_a11y(a11y_tree) - - assert "5" in rects - assert rects["5"] == [100, 200, 300, 400] - - def test_extract_rects_nested(self): - """Test extracting rects from nested a11y tree.""" - adapter = WAALiveAdapter() - a11y_tree = { - "id": "root", - "children": [ - { - "id": "1", - "bbox": [0, 0, 500, 500], - "children": [ - { - "id": "3", - "bbox": [50, 50, 150, 100], - } - ] - } - ] - } - - rects = adapter._extract_rects_from_a11y(a11y_tree) - - assert "root" in rects or "1" in rects # Depends on if root has bbox - assert "1" in rects - assert "3" in rects - assert rects["3"] == [50, 50, 150, 100] - - def test_extract_rects_empty_tree(self): - """Test extracting rects from empty a11y tree.""" - adapter = WAALiveAdapter() - - rects = adapter._extract_rects_from_a11y(None) - assert rects == {} - - rects = adapter._extract_rects_from_a11y({}) - assert rects == {} - - def test_extract_rects_no_bbox(self): - """Test elements without bbox are skipped.""" - adapter = WAALiveAdapter() - a11y_tree = { - "id": "root", - "children": [{"id": "5", "name": "Button"}] # No bbox - } - - rects = adapter._extract_rects_from_a11y(a11y_tree) - - # Element without bbox should not be in rects - assert "5" not in rects - - def test_click_element_not_in_rects_warns(self): - """Test click with unknown element ID logs warning and uses coords.""" - adapter = WAALiveAdapter() - adapter._current_rects = {"1": [0, 0, 100, 100]} # Element 7 not here - - action = BenchmarkAction( - type="click", - target_node_id="7", # Not in rects - x=999, y=999 - ) - command = adapter._translate_action(action) - - # Should fall back to coordinate-based click - assert "move_abs" in command - assert "999" in command - - -class TestObservationFetching: - """Tests for observation fetching.""" - - @patch("openadapt_ml.benchmarks.waa_live.requests") - def test_get_observation(self, mock_requests): - """Test fetching observation from server.""" - # Mock screenshot response - screenshot_response = Mock() - screenshot_response.status_code = 200 - screenshot_response.content = b"fake_png_data" - - # Mock a11y response - a11y_response = Mock() - a11y_response.status_code = 200 - a11y_response.json.return_value = {"AT": {"id": "root"}} - - mock_requests.get.side_effect = [screenshot_response, a11y_response] - - adapter = WAALiveAdapter() - obs = adapter._get_observation() - - assert obs.screenshot == b"fake_png_data" - assert obs.accessibility_tree 
== {"id": "root"} - assert obs.viewport == (1920, 1200) - - -if __name__ == "__main__": - pytest.main([__file__, "-v"]) diff --git a/uv.lock b/uv.lock index 68193f3..9c77a77 100644 --- a/uv.lock +++ b/uv.lock @@ -996,7 +996,7 @@ name = "exceptiongroup" version = "1.3.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "typing-extensions", marker = "python_full_version < '3.13'" }, + { name = "typing-extensions", marker = "python_full_version < '3.11'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/50/79/66800aadf48771f6b62f7eb014e352e5d06856655206165d775e675a02c9/exceptiongroup-1.3.1.tar.gz", hash = "sha256:8b412432c6055b0b7d14c310000ae93352ed6754f70fa8f7c34141f91c4e3219", size = 30371 } wheels = [ @@ -2645,6 +2645,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c4/a8/f08d2fb482dc64443ae7208f9616b994b74b63f052737373f9fc32eb0ead/openadapt_capture-0.1.0-py3-none-any.whl", hash = "sha256:229e6d762dcfe22a34655853b5cf7c9eb08a61238cc79eefdffcf01f0c3dc860", size = 57921 }, ] +[[package]] +name = "openadapt-evals" +version = "0.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pillow" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/22/56/2e952caefce3755d82a0633b42ed9fbf4fbb8bfe45da5c693f1e2de50e7d/openadapt_evals-0.1.0.tar.gz", hash = "sha256:1015bc0fffba318e89f963bcf189979e6f8a4edf7a7b9f602339886be731ca6b", size = 256352 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/18/93/d76148490517ae02d417608f7906e2c34a63f68dcc1506c9387484112580/openadapt_evals-0.1.0-py3-none-any.whl", hash = "sha256:05a79c8598b41d90a5c5e33a114dc0aa9f0c81956b5d71f3ebd130d025d79dd0", size = 72408 }, +] + [[package]] name = "openadapt-ml" version = "0.2.0" @@ -2678,6 +2690,9 @@ azure = [ { name = "azure-ai-ml" }, { name = "azure-identity" }, ] +benchmarks = [ + { name = "openadapt-evals" }, +] dev = [ { name = "pytest" }, { name = "ruff" }, @@ -2706,6 +2721,7 @@ requires-dist = [ { name = "google-generativeai", specifier = ">=0.8.5" }, { name = "matplotlib", specifier = ">=3.10.7" }, { name = "openadapt-capture", specifier = ">=0.1.0" }, + { name = "openadapt-evals", marker = "extra == 'benchmarks'", specifier = ">=0.1.0" }, { name = "openai", marker = "extra == 'api'", specifier = ">=1.0.0" }, { name = "peft", specifier = ">=0.18.0" }, { name = "pillow", specifier = ">=12.0.0" },