diff --git a/.beads/issues.jsonl b/.beads/issues.jsonl
index 7d6f6b8..e8a0589 100644
--- a/.beads/issues.jsonl
+++ b/.beads/issues.jsonl
@@ -1,11 +1,12 @@
{"id":"openadapt-evals-0dt","title":"Add pre-flight check for Windows install issues","description":"Detect product key prompts or stuck installations BEFORE 10-minute timeout. Check container logs for specific error patterns.","status":"open","priority":1,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T18:57:42.24338-05:00","created_by":"Richard Abrich","updated_at":"2026-01-20T18:57:42.24338-05:00"}
-{"id":"openadapt-evals-0ms","title":"Run 20-50 task evaluation","description":"Run WAA benchmark on 20-50 tasks to measure baseline success rate. Target is \u003e80% success rate. This provides quantitative data on agent performance.","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T17:44:26.461765-05:00","created_by":"Richard Abrich","updated_at":"2026-01-20T17:44:26.461765-05:00","dependencies":[{"issue_id":"openadapt-evals-0ms","depends_on_id":"openadapt-evals-c3f","type":"blocks","created_at":"2026-01-20T17:44:26.462904-05:00","created_by":"Richard Abrich"}]}
+{"id":"openadapt-evals-0ms","title":"Run 20-50 task evaluation","description":"Run WAA benchmark on 20-50 tasks to measure baseline success rate. Target is \u003e80% success rate. This provides quantitative data on agent performance.","notes":"2026-01-29: Azure quota limits parallelization to 2 workers max (10 vCPUs / 4 vCPUs per worker). 10-worker test failed with ClusterCoreQuotaReached. User declined manual portal quota increase. Waiting for api-openai test results before full 154-task run.","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T17:44:26.461765-05:00","created_by":"Richard Abrich","updated_at":"2026-01-28T20:16:32.776141-05:00","dependencies":[{"issue_id":"openadapt-evals-0ms","depends_on_id":"openadapt-evals-c3f","type":"blocks","created_at":"2026-01-20T17:44:26.462904-05:00","created_by":"Richard Abrich"}]}
{"id":"openadapt-evals-2ar","title":"Implement permanent fix for Windows unattended install","status":"closed","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T18:59:36.544113-05:00","created_by":"Richard Abrich","updated_at":"2026-01-20T20:32:06.634857-05:00","closed_at":"2026-01-20T20:32:06.634857-05:00","close_reason":"Duplicate of openadapt-evals-b3l"}
{"id":"openadapt-evals-5o8","title":"Analyze evaluation results","description":"Analyze WAA evaluation results to identify failure modes, success patterns, and improvement opportunities. Document findings and create actionable next steps.","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T17:44:29.782932-05:00","created_by":"Richard Abrich","updated_at":"2026-01-20T17:44:29.782932-05:00","dependencies":[{"issue_id":"openadapt-evals-5o8","depends_on_id":"openadapt-evals-0ms","type":"blocks","created_at":"2026-01-20T17:44:29.783756-05:00","created_by":"Richard Abrich"}]}
-{"id":"openadapt-evals-b3l","title":"Implement permanent fix for Windows unattended install","description":"ROOT CAUSE FOUND: Using dev mode (UNC paths \\\\host.lan\\Data) instead of Azure mode (C:\\oem). Dev mode had UNC escaping bug in patch_xml.py. FIX: Simplified Dockerfile using vanilla WAA Azure mode approach - native OEM mechanism, no samba.sh patching, no custom FirstLogonCommands.","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T18:57:42.092949-05:00","created_by":"Richard Abrich","updated_at":"2026-01-21T12:47:07.710012-05:00","comments":[{"id":7,"issue_id":"openadapt-evals-b3l","author":"Richard Abrich","text":"Jan 22: Confirmed issue recurred because we were booting from corrupted data.img created with dev mode. Fix: delete /data/waa-storage/* and let vanilla windowsarena/winarena create fresh install.","created_at":"2026-01-22T23:45:59Z"},{"id":8,"issue_id":"openadapt-evals-b3l","author":"Richard Abrich","text":"Jan 22 FIXED: Issues were (1) CLI storage path mismatch /mnt vs /data, (2) booting from corrupted data.img. Fix: standardized paths + deleted corrupted image. Fresh vanilla WAA install now at 18%+ and progressing.","created_at":"2026-01-22T23:56:59Z"}]}
-{"id":"openadapt-evals-c3f","title":"Complete WAA validation","description":"Validate that the WAA benchmark setup works end-to-end. Run a single task to confirm the infrastructure is operational before scaling up to full evaluation.","notes":"2026-01-22: Attempted end-to-end live smoke run on Azure VM.\n\n- Command: uv run python -m openadapt_evals.benchmarks.cli smoke-live --vm-name waa-eval-vm --resource-group OPENADAPT-AGENTS --task-id notepad_1\n- VM start + public IP succeeded (172.171.112.41)\n- Blocker: az vm run-command invoke timed out while running 'docker start winarena' (container start never returned)\n- Result: WAA server never became reachable on :5000; live eval could not connect\n- Cleanup: VM deallocated at end to stop spend\n\nNext: run remote docker diagnostics (docker ps -a, docker logs winarena, systemctl status docker, disk space) and fix underlying image/container hang (likely winarena pull/extract / docker stuck).","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T17:44:18.817497-05:00","created_by":"Richard Abrich","updated_at":"2026-01-22T10:31:57.790605-05:00"}
+{"id":"openadapt-evals-5t1","title":"WAA 500 error root cause: Navi agent method signature mismatch","notes":"FILED: https://github.com/microsoft/WindowsAgentArena/issues/79","status":"closed","priority":1,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-28T20:16:39.141187-05:00","created_by":"Richard Abrich","updated_at":"2026-01-28T20:29:38.780227-05:00","closed_at":"2026-01-28T20:29:38.780227-05:00","close_reason":"Issue filed upstream","labels":["bug","upstream","waa"]}
+{"id":"openadapt-evals-b3l","title":"Implement permanent fix for Windows unattended install","description":"ROOT CAUSE FOUND: Using dev mode (UNC paths \\\\host.lan\\Data) instead of Azure mode (C:\\oem). Dev mode had UNC escaping bug in patch_xml.py. FIX: Simplified Dockerfile using vanilla WAA Azure mode approach - native OEM mechanism, no samba.sh patching, no custom FirstLogonCommands.","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T18:57:42.092949-05:00","created_by":"Richard Abrich","updated_at":"2026-01-21T12:47:07.710012-05:00","comments":[{"id":1,"issue_id":"openadapt-evals-b3l","author":"Richard Abrich","text":"Jan 22: Confirmed issue recurred because we were booting from corrupted data.img created with dev mode. Fix: delete /data/waa-storage/* and let vanilla windowsarena/winarena create fresh install.","created_at":"2026-01-22T23:45:59Z"},{"id":2,"issue_id":"openadapt-evals-b3l","author":"Richard Abrich","text":"Jan 22 FIXED: Issues were (1) CLI storage path mismatch /mnt vs /data, (2) booting from corrupted data.img. Fix: standardized paths + deleted corrupted image. Fresh vanilla WAA install now at 18%+ and progressing.","created_at":"2026-01-22T23:56:59Z"}]}
+{"id":"openadapt-evals-c3f","title":"Complete WAA validation","description":"Validate that the WAA benchmark setup works end-to-end. Run a single task to confirm the infrastructure is operational before scaling up to full evaluation.","notes":"2026-01-29: 500 error root cause identified - NOT QEMU version (10.0.6 is fine). Root cause is Navi agent method signature mismatch: computer.mouse.drag(x=, y=, x_end=) vs drag(screen_x, screen_y). Our api-openai/api-claude agents should avoid this since they use pyautogui directly. Testing with api-openai agent (agent a72af46 running).","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T17:44:18.817497-05:00","created_by":"Richard Abrich","updated_at":"2026-01-28T20:16:32.593953-05:00"}
{"id":"openadapt-evals-czj","title":"Docker installation fails on Azure VM - pkgProblemResolver error","description":"vm setup-waa fails to install Docker. Error: pkgProblemResolver::Resolve generated breaks. Need to investigate root cause before attempting fix.","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T22:48:59.527637-05:00","created_by":"Richard Abrich","updated_at":"2026-01-20T22:48:59.527637-05:00"}
{"id":"openadapt-evals-dke","title":"SYSTEM: Create knowledge persistence workflow using Beads","description":"Every fix/approach must be logged as a Beads issue with:\n1. Problem description\n2. Attempted solution\n3. Result (worked/failed/partial)\n4. Root cause if known\n5. Files changed\n\nBefore any fix attempt, agent MUST:\n1. Run 'bd list --labels=fix,approach' to see prior attempts\n2. Review what was tried before\n3. Document new attempt BEFORE implementing\n\nAfter context compaction, first action:\n1. Run 'bd ready' for current tasks\n2. Run 'bd list --labels=recurring' for known recurring issues\n3. Check docs/RECURRING_ISSUES.md for patterns","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T19:00:18.155796-05:00","created_by":"Richard Abrich","updated_at":"2026-01-20T19:00:18.155796-05:00"}
-{"id":"openadapt-evals-gna","title":"Test simplified Dockerfile (Azure mode)","description":"Testing Dockerfile.simplified which uses vanilla WAA Azure mode: native OEM mechanism (C:\\oem), InstallFrom element for unattended install, VERSION=11e for no product key. Steps: 1) Delete current VM 2) Create fresh VM 3) Build simplified image 4) Test Windows installation via QEMU screenshots","notes":"2026-01-22: Confirmed the blocker is not just docker pull; even starting the existing 'winarena' container via az vm run-command timed out.\n\n- smoke-live tried to run docker start winarena via run-command and timed out (900s)\n- WAA server remained unreachable at http://172.171.112.41:5000\n- VM was deallocated after the attempt\n\nImplication: VM/docker state is unhealthy or container start is hanging (possibly due to incomplete image extraction / stuck daemon / disk pressure).\nNext: add/run a vm-debug command to capture docker/system logs and determine whether to rebuild VM/image, pin/mirror image (ACR), or adjust docker config.","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-21T12:47:15.12243-05:00","created_by":"Richard Abrich","updated_at":"2026-01-22T10:32:01.038825-05:00","labels":["testing","waa"],"comments":[{"id":1,"issue_id":"openadapt-evals-gna","author":"Richard Abrich","text":"Session Recovery 2026-01-22 17:58: Previous agents killed during compaction. VM state: Docker/containerd unhealthy, disk /mnt only 32GB (need 47GB+ for vanilla WAA). Git-lfs failing. User feedback: 1) use beads, 2) larger disk, 3) clean up CLI, 4) vanilla WAA config.","created_at":"2026-01-22T18:05:45Z"},{"id":2,"issue_id":"openadapt-evals-gna","author":"Richard Abrich","text":"Launched 3 parallel agents: ae159fc (VM disk upgrade), aabad47 (CLI cleanup), aee4e8a (fix containerd). Check /private/tmp/claude/-Users-abrichr-oa-src-openadapt-ml/tasks/*.output for results.","created_at":"2026-01-22T18:06:18Z"},{"id":3,"issue_id":"openadapt-evals-gna","author":"Richard Abrich","text":"WORKFLOW DOCUMENTED: VM config changes = delete VM -\u003e update code -\u003e relaunch. Added to CLAUDE.md. Default VM size now D8ds_v5 (300GB). Launching fresh VM now.","created_at":"2026-01-22T18:09:12Z"},{"id":4,"issue_id":"openadapt-evals-gna","author":"Richard Abrich","text":"2026-01-22 18:20: VM resources cleaned up, launched agent a9be1f8 to add auto-cleanup to CLI, WAA setup retrying in background (b04fcbe). Workflow documented in CLAUDE.md and STATUS.md.","created_at":"2026-01-22T18:11:56Z"},{"id":5,"issue_id":"openadapt-evals-gna","author":"Richard Abrich","text":"2026-01-22 18:30: VM created with D8s_v3 fallback (D8ds_v5 quota 0), IP 20.120.37.97. Restored waa_deploy symlink. Docker image building. W\u0026B integration agent a21c3ef running.","created_at":"2026-01-22T18:25:29Z"},{"id":6,"issue_id":"openadapt-evals-gna","author":"Richard Abrich","text":"2026-01-22 19:05: WAA Docker image built successfully! Container running. Windows booting. VM: 20.120.37.97, VNC: http://20.120.37.97:8006","created_at":"2026-01-22T18:47:03Z"}]}
+{"id":"openadapt-evals-gna","title":"Test simplified Dockerfile (Azure mode)","description":"Testing Dockerfile.simplified which uses vanilla WAA Azure mode: native OEM mechanism (C:\\oem), InstallFrom element for unattended install, VERSION=11e for no product key. Steps: 1) Delete current VM 2) Create fresh VM 3) Build simplified image 4) Test Windows installation via QEMU screenshots","notes":"2026-01-22: Confirmed the blocker is not just docker pull; even starting the existing 'winarena' container via az vm run-command timed out.\n\n- smoke-live tried to run docker start winarena via run-command and timed out (900s)\n- WAA server remained unreachable at http://172.171.112.41:5000\n- VM was deallocated after the attempt\n\nImplication: VM/docker state is unhealthy or container start is hanging (possibly due to incomplete image extraction / stuck daemon / disk pressure).\nNext: add/run a vm-debug command to capture docker/system logs and determine whether to rebuild VM/image, pin/mirror image (ACR), or adjust docker config.","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-21T12:47:15.12243-05:00","created_by":"Richard Abrich","updated_at":"2026-01-22T10:32:01.038825-05:00","labels":["testing","waa"],"comments":[{"id":3,"issue_id":"openadapt-evals-gna","author":"Richard Abrich","text":"Session Recovery 2026-01-22 17:58: Previous agents killed during compaction. VM state: Docker/containerd unhealthy, disk /mnt only 32GB (need 47GB+ for vanilla WAA). Git-lfs failing. User feedback: 1) use beads, 2) larger disk, 3) clean up CLI, 4) vanilla WAA config.","created_at":"2026-01-22T18:05:45Z"},{"id":4,"issue_id":"openadapt-evals-gna","author":"Richard Abrich","text":"Launched 3 parallel agents: ae159fc (VM disk upgrade), aabad47 (CLI cleanup), aee4e8a (fix containerd). Check /private/tmp/claude/-Users-abrichr-oa-src-openadapt-ml/tasks/*.output for results.","created_at":"2026-01-22T18:06:18Z"},{"id":5,"issue_id":"openadapt-evals-gna","author":"Richard Abrich","text":"WORKFLOW DOCUMENTED: VM config changes = delete VM -\u003e update code -\u003e relaunch. Added to CLAUDE.md. Default VM size now D8ds_v5 (300GB). Launching fresh VM now.","created_at":"2026-01-22T18:09:12Z"},{"id":6,"issue_id":"openadapt-evals-gna","author":"Richard Abrich","text":"2026-01-22 18:20: VM resources cleaned up, launched agent a9be1f8 to add auto-cleanup to CLI, WAA setup retrying in background (b04fcbe). Workflow documented in CLAUDE.md and STATUS.md.","created_at":"2026-01-22T18:11:56Z"},{"id":7,"issue_id":"openadapt-evals-gna","author":"Richard Abrich","text":"2026-01-22 18:30: VM created with D8s_v3 fallback (D8ds_v5 quota 0), IP 20.120.37.97. Restored waa_deploy symlink. Docker image building. W\u0026B integration agent a21c3ef running.","created_at":"2026-01-22T18:25:29Z"},{"id":8,"issue_id":"openadapt-evals-gna","author":"Richard Abrich","text":"2026-01-22 19:05: WAA Docker image built successfully! Container running. Windows booting. VM: 20.120.37.97, VNC: http://20.120.37.97:8006","created_at":"2026-01-22T18:47:03Z"}]}
{"id":"openadapt-evals-sz4","title":"RCA: Windows product key prompt recurring issue","status":"closed","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T18:59:36.266286-05:00","created_by":"Richard Abrich","updated_at":"2026-01-20T20:32:06.493102-05:00","closed_at":"2026-01-20T20:32:06.493102-05:00","close_reason":"RCA complete - root cause is VERSION mismatch (CLI=11, Dockerfile=11e). Fix documented in RECURRING_ISSUES.md and WINDOWS_PRODUCT_KEY_RCA.md"}
{"id":"openadapt-evals-wis","title":"Add pre-flight check to detect Windows install issues","status":"closed","priority":1,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T18:59:36.865052-05:00","created_by":"Richard Abrich","updated_at":"2026-01-20T20:32:06.757261-05:00","closed_at":"2026-01-20T20:32:06.757261-05:00","close_reason":"Duplicate of openadapt-evals-0dt"}
diff --git a/README.md b/README.md
index 4d074af..f2c8802 100644
--- a/README.md
+++ b/README.md
@@ -422,6 +422,46 @@ See [LIVE_MONITORING.md](./LIVE_MONITORING.md) for full documentation.
- [CLAUDE.md](./CLAUDE.md) - Development guide and best practices
- [CHANGELOG.md](./CHANGELOG.md) - Version history and changes
+## WAA Benchmark Results
+
+> **⚠️ PLACEHOLDER**: The results below are placeholders. Actual benchmark results will be added once the full evaluation completes.
+
+### Baseline Reproduction
+
+We run the full WAA benchmark using the same methodology as the original paper to establish baseline performance.
+
+**WAA Baseline Results (GPT-4o):**
+
+| Metric | Paper Reported | Our Reproduction | Status |
+|--------|----------------|------------------|--------|
+| Success Rate | ~19.5% | `[PLACEHOLDER]` | `[PENDING]` |
+| Tasks Evaluated | 154 | `[PLACEHOLDER]` | `[PENDING]` |
+| Avg Steps/Task | N/A | `[PLACEHOLDER]` | `[PENDING]` |
+| Avg Time/Task | N/A | `[PLACEHOLDER]` | `[PENDING]` |
+
+### Model Comparison
+
+Performance of different agents on WAA:
+
+| Agent | Success Rate | Avg Steps | Notes |
+|-------|--------------|-----------|-------|
+| GPT-4o (baseline) | `[PLACEHOLDER]` | `[PLACEHOLDER]` | Zero-shot |
+| Claude Sonnet 4.5 | `[PLACEHOLDER]` | `[PLACEHOLDER]` | Zero-shot |
+
+### Domain Breakdown
+
+Success rates by Windows application domain:
+
+| Domain | Tasks | Success Rate |
+|--------|-------|--------------|
+| Notepad | `[PLACEHOLDER]` | `[PLACEHOLDER]` |
+| Chrome | `[PLACEHOLDER]` | `[PLACEHOLDER]` |
+| File Explorer | `[PLACEHOLDER]` | `[PLACEHOLDER]` |
+| Settings | `[PLACEHOLDER]` | `[PLACEHOLDER]` |
+| ... | ... | ... |
+
+> **Note**: Full domain breakdown will be added when benchmark completes.
+
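+To sanity-check the benchmark harness before reproducing these numbers, the
+adapters can be exercised directly. This is a minimal sketch: the mock adapter
+needs no Windows VM, and the live adapter assumes a WAA server is reachable on
+port 5000 (replace `<vm-ip>` with your VM's address).
+
+```python
+from openadapt_evals.adapters.waa import WAAMockAdapter, WAALiveAdapter, WAALiveConfig
+
+# Local dry run: synthetic tasks with IDs like "mock_notepad_001"
+mock = WAAMockAdapter(num_tasks=10)
+
+# Real evaluation against a running WAA server
+live = WAALiveAdapter(WAALiveConfig(server_url="http://<vm-ip>:5000"))
+```
+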
## License
MIT
diff --git a/openadapt_evals/adapters/__init__.py b/openadapt_evals/adapters/__init__.py
index cd59373..519bea0 100644
--- a/openadapt_evals/adapters/__init__.py
+++ b/openadapt_evals/adapters/__init__.py
@@ -34,8 +34,16 @@
StaticDatasetAdapter,
UIElement,
)
-from openadapt_evals.adapters.waa import WAAAdapter, WAAConfig, WAAMockAdapter
-from openadapt_evals.adapters.waa_live import WAALiveAdapter, WAALiveConfig
+from openadapt_evals.adapters.waa import (
+ WAAAdapter,
+ WAAConfig,
+ WAAMockAdapter,
+ WAALiveAdapter,
+ WAALiveConfig,
+ SyntheticTaskError,
+ is_real_waa_task_id,
+ is_synthetic_task_id,
+)
__all__ = [
# Base classes
@@ -52,4 +60,8 @@
"WAAMockAdapter",
"WAALiveAdapter",
"WAALiveConfig",
+ # Task ID validation
+ "SyntheticTaskError",
+ "is_real_waa_task_id",
+ "is_synthetic_task_id",
]
diff --git a/openadapt_evals/adapters/waa/__init__.py b/openadapt_evals/adapters/waa/__init__.py
new file mode 100644
index 0000000..58b1e94
--- /dev/null
+++ b/openadapt_evals/adapters/waa/__init__.py
@@ -0,0 +1,51 @@
+"""Windows Agent Arena (WAA) adapters.
+
+This module provides adapters for the Windows Agent Arena benchmark:
+- WAAAdapter: Full WAA integration (requires WAA repo)
+- WAAMockAdapter: Mock adapter for testing (no Windows required)
+- WAALiveAdapter: HTTP adapter for remote WAA server
+
+Example:
+ ```python
+ from openadapt_evals.adapters.waa import WAAMockAdapter, WAALiveAdapter
+
+ # For local testing (no Windows VM)
+ adapter = WAAMockAdapter(num_tasks=10)
+
+ # For remote evaluation
+ adapter = WAALiveAdapter(server_url="http://vm-ip:5000")
+ ```
+"""
+
+from openadapt_evals.adapters.waa.mock import (
+ WAAAdapter,
+ WAAConfig,
+ WAAMockAdapter,
+ WAA_DOMAINS,
+)
+from openadapt_evals.adapters.waa.live import (
+ WAALiveAdapter,
+ WAALiveConfig,
+ SyntheticTaskError,
+ is_real_waa_task_id,
+ is_synthetic_task_id,
+ WAA_TASK_ID_PATTERN,
+ SYNTHETIC_TASK_PATTERNS,
+)
+
+__all__ = [
+ # Mock/full adapters
+ "WAAAdapter",
+ "WAAConfig",
+ "WAAMockAdapter",
+ "WAA_DOMAINS",
+ # Live adapter
+ "WAALiveAdapter",
+ "WAALiveConfig",
+ "WAA_TASK_ID_PATTERN",
+ "SYNTHETIC_TASK_PATTERNS",
+ # Task ID validation
+ "SyntheticTaskError",
+ "is_real_waa_task_id",
+ "is_synthetic_task_id",
+]
diff --git a/openadapt_evals/adapters/waa_live.py b/openadapt_evals/adapters/waa/live.py
similarity index 88%
rename from openadapt_evals/adapters/waa_live.py
rename to openadapt_evals/adapters/waa/live.py
index 0982e1a..34143dc 100644
--- a/openadapt_evals/adapters/waa_live.py
+++ b/openadapt_evals/adapters/waa/live.py
@@ -15,7 +15,7 @@
not pixel coordinates. WAA's Computer class handles the grounding.
Example:
- from openadapt_evals.benchmarks.waa_live import WAALiveAdapter, WAALiveConfig
+ from openadapt_evals.adapters.waa import WAALiveAdapter, WAALiveConfig
adapter = WAALiveAdapter(WAALiveConfig(server_url="http://vm-ip:5000"))
agent = DemoConditionedAgent(base_agent, retriever)
@@ -26,6 +26,7 @@
import base64
import logging
+import re
import time
from dataclasses import dataclass
from typing import Any
@@ -41,6 +42,70 @@
logger = logging.getLogger(__name__)
+# WAA task IDs are UUIDs with a domain suffix, e.g., "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx-WOS"
+# Common suffixes: WOS (Windows OS), CHR (Chrome), NTP (Notepad), etc.
+WAA_TASK_ID_PATTERN = re.compile(
+ r'^[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}(-[A-Za-z0-9]+)?$'
+)
+
+# Synthetic task ID patterns (from mock adapter or testing)
+SYNTHETIC_TASK_PATTERNS = [
+ re.compile(r'^(mock_)?[a-z_]+_\d+$'), # notepad_1, mock_chrome_001
+ re.compile(r'^mock_'), # any mock_ prefix
+]
+
+
+def is_real_waa_task_id(task_id: str) -> bool:
+ """Check if a task ID matches the real WAA UUID format.
+
+ Real WAA task IDs are UUIDs from test_small.json or test_all.json, e.g.:
+ - "a1b2c3d4-e5f6-7890-abcd-ef1234567890-WOS"
+ - "12345678-1234-1234-1234-123456789012-CHR"
+
+ Synthetic task IDs are simple patterns like:
+ - "notepad_1", "chrome_2" (from mock adapter)
+ - "mock_notepad_001" (explicit mock prefix)
+
+ Args:
+ task_id: Task identifier to check.
+
+ Returns:
+ True if the task ID appears to be a real WAA UUID.
+ """
+ return bool(WAA_TASK_ID_PATTERN.match(task_id))
+
+
+def is_synthetic_task_id(task_id: str) -> bool:
+ """Check if a task ID appears to be synthetic (for testing).
+
+ Args:
+ task_id: Task identifier to check.
+
+ Returns:
+ True if the task ID matches synthetic patterns.
+ """
+ for pattern in SYNTHETIC_TASK_PATTERNS:
+ if pattern.match(task_id):
+ return True
+ return False
+
+
+class SyntheticTaskError(ValueError):
+ """Raised when a synthetic task ID is used with the live adapter."""
+
+ def __init__(self, task_id: str):
+ self.task_id = task_id
+ super().__init__(
+ f"Task ID '{task_id}' appears to be synthetic (for testing). "
+ f"The live adapter requires real WAA task IDs (UUIDs from test_small.json or test_all.json). "
+ f"\n\nTo fix this:"
+ f"\n 1. Use --mock flag for testing without a Windows VM"
+ f"\n 2. Or provide real WAA task IDs with --task-ids"
+ f"\n 3. Or use --tasks N to select N random real tasks"
+ f"\n\nExample real task ID: 'a1b2c3d4-e5f6-7890-abcd-ef1234567890-WOS'"
+ )
+
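+# A quick sketch of the intended behavior of the checks above (illustrative,
+# not an executed doctest):
+#
+#     >>> is_real_waa_task_id("a1b2c3d4-e5f6-7890-abcd-ef1234567890-WOS")
+#     True
+#     >>> is_synthetic_task_id("mock_notepad_001")
+#     True
+#     >>> is_real_waa_task_id("notepad_1")
+#     False
+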
+
@dataclass
class WAALiveConfig:
"""Configuration for WAALiveAdapter.
@@ -139,11 +204,20 @@ def load_task(self, task_id: str) -> BenchmarkTask:
3. Creates minimal task as fallback
Args:
- task_id: Task identifier (e.g., "notepad_1", "browser_abc123").
+ task_id: Task identifier. Must be a real WAA UUID
+ (e.g., "a1b2c3d4-e5f6-7890-abcd-ef1234567890-WOS").
Returns:
BenchmarkTask object with evaluator config if available.
+
+ Raises:
+ SyntheticTaskError: If task_id appears to be synthetic (e.g., "notepad_1").
+ Use WAAMockAdapter for synthetic/testing tasks.
"""
+ # Validate that this is a real WAA task ID, not a synthetic one
+ if is_synthetic_task_id(task_id):
+ raise SyntheticTaskError(task_id)
+
import requests
# Try to load from server first
@@ -447,46 +521,45 @@ def evaluate(self, task: BenchmarkTask) -> BenchmarkResult:
return self._evaluate_fallback(task)
def _evaluate_fallback(self, task: BenchmarkTask) -> BenchmarkResult:
- """Fallback evaluation when /evaluate endpoint is unavailable.
+        """Fallback when proper evaluation is unavailable; returns failure.
- Uses a simple heuristic based on:
- - Whether the agent took any actions
- - Whether the agent called DONE
- - Whether the task has success criteria we can check locally
+ This method explicitly fails instead of providing fake heuristic scores.
+ Proper evaluation requires either:
+ 1. WAA server with /evaluate endpoint deployed
+ 2. Task configs with evaluator specs (set waa_examples_path)
+ 3. Real WAA task IDs (UUIDs from test_small.json/test_all.json)
Args:
task: Task to evaluate.
Returns:
- BenchmarkResult with heuristic-based score.
+ BenchmarkResult with success=False and score=0.0.
"""
- has_actions = len(self._actions) > 0
- called_done = any(a.type == "done" for a in self._actions)
- typed_text = any(a.type == "type" and a.text for a in self._actions)
-
- # Calculate heuristic score
- score = 0.0
- if has_actions:
- score += 0.2
- if called_done:
- score += 0.2
- if typed_text:
- score += 0.1
- if self._step_count >= 2:
- score += 0.1
-
- # Cap at 0.5 since we can't truly verify success
- score = min(score, 0.5)
+ # Check if task has evaluator config
+ has_evaluator = bool(
+ task.raw_config and task.raw_config.get("evaluator")
+ )
+
+ if has_evaluator:
+ reason = (
+ "Evaluation unavailable: WAA /evaluate endpoint not deployed. "
+ "Task has evaluator config but server cannot run it."
+ )
+ else:
+ reason = (
+ "Evaluation unavailable: task config missing evaluator spec. "
+ "Set waa_examples_path in config or use real WAA task IDs "
+ "(UUIDs from test_small.json/test_all.json, not synthetic IDs like 'notepad_1')."
+ )
+
+ logger.error(reason)
return BenchmarkResult(
task_id=task.task_id,
- success=False, # Can't determine without proper evaluation
- score=score,
+ success=False,
+ score=0.0,
num_steps=self._step_count,
- reason=(
- "Fallback evaluation (WAA /evaluate endpoint unavailable). "
- f"Heuristic: actions={len(self._actions)}, done={called_done}, typed={typed_text}"
- ),
+ reason=reason,
)
def close(self) -> None:
diff --git a/openadapt_evals/adapters/waa.py b/openadapt_evals/adapters/waa/mock.py
similarity index 96%
rename from openadapt_evals/adapters/waa.py
rename to openadapt_evals/adapters/waa/mock.py
index 50b7477..550c2c0 100644
--- a/openadapt_evals/adapters/waa.py
+++ b/openadapt_evals/adapters/waa/mock.py
@@ -544,14 +544,24 @@ def _to_waa_action(self, action: BenchmarkAction) -> dict:
class WAAMockAdapter(BenchmarkAdapter):
"""Mock WAA adapter for testing without Windows VM.
+ This adapter generates synthetic tasks for testing the benchmark infrastructure
+ without requiring a Windows VM or WAA server. Task IDs are prefixed with "mock_"
+ to clearly distinguish them from real WAA task IDs.
+
Useful for:
- Testing the benchmark integration without actual WAA
- Development on non-Windows platforms
- Unit tests
+ - Verifying agent behavior before running real evaluations
Args:
num_tasks: Number of mock tasks to generate.
domains: Domains to include in mock tasks.
+
+ Note:
+ Mock task IDs use the format "mock_{domain}_{number}" (e.g., "mock_notepad_001").
+ These IDs are explicitly rejected by WAALiveAdapter to prevent confusion
+ between testing and real evaluation runs.
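+
+    Example (illustrative; assumes the default ID scheme described above):
+        adapter = WAAMockAdapter(num_tasks=4, domains=["notepad", "chrome"])
+        # -> tasks "mock_notepad_001", "mock_notepad_002", "mock_chrome_001", ...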
"""
def __init__(
@@ -578,21 +588,27 @@ def benchmark_type(self) -> str:
return "interactive"
def _generate_mock_tasks(self) -> None:
- """Generate mock tasks for testing."""
+ """Generate mock tasks for testing.
+
+ Task IDs use the format "mock_{domain}_{number}" (e.g., "mock_notepad_001")
+ to clearly distinguish them from real WAA UUIDs. This prevents accidental
+ use of synthetic tasks with the live adapter.
+ """
tasks_per_domain = self._num_tasks // len(self._domains)
extra = self._num_tasks % len(self._domains)
for i, domain in enumerate(self._domains):
count = tasks_per_domain + (1 if i < extra else 0)
for j in range(count):
- task_id = f"{domain}_{j + 1}"
+ # Use mock_ prefix to clearly indicate synthetic task
+ task_id = f"mock_{domain}_{j + 1:03d}"
self._tasks.append(
BenchmarkTask(
task_id=task_id,
instruction=f"Mock task {j + 1} in {domain} domain",
domain=domain,
time_limit_steps=15,
- raw_config={"mock": True},
+ raw_config={"mock": True, "synthetic": True},
)
)
diff --git a/openadapt_evals/benchmarks/agent.py b/openadapt_evals/benchmarks/agent.py
deleted file mode 100644
index 6c7917e..0000000
--- a/openadapt_evals/benchmarks/agent.py
+++ /dev/null
@@ -1,37 +0,0 @@
-"""DEPRECATED: Import from openadapt_evals.agents instead.
-
-This module is kept for backward compatibility only.
-All classes are re-exported from openadapt_evals.agents.
-"""
-
-import warnings
-
-warnings.warn(
- "openadapt_evals.benchmarks.agent is deprecated. "
- "Please import from openadapt_evals.agents instead.",
- DeprecationWarning,
- stacklevel=2,
-)
-
-# Re-export from canonical location
-from openadapt_evals.agents import (
- BenchmarkAgent,
- RandomAgent,
- ScriptedAgent,
- SmartMockAgent,
- ApiAgent,
- action_to_string,
- format_accessibility_tree,
- parse_action_response,
-)
-
-__all__ = [
- "BenchmarkAgent",
- "RandomAgent",
- "ScriptedAgent",
- "SmartMockAgent",
- "ApiAgent",
- "action_to_string",
- "format_accessibility_tree",
- "parse_action_response",
-]
diff --git a/openadapt_evals/benchmarks/auto_screenshot.py b/openadapt_evals/benchmarks/auto_screenshot.py
deleted file mode 100644
index 132e764..0000000
--- a/openadapt_evals/benchmarks/auto_screenshot.py
+++ /dev/null
@@ -1,236 +0,0 @@
-"""Auto-screenshot tool for capturing benchmark viewer in multiple viewports.
-
-This module provides functionality to automatically capture screenshots of the
-benchmark viewer HTML in different viewport sizes (desktop, tablet, mobile) and
-different states (overview, details panel, log panel, etc.).
-
-Usage:
- from openadapt_evals.benchmarks.auto_screenshot import generate_screenshots
-
- generate_screenshots(
- html_path="benchmark_results/viewer_demo/viewer.html",
- output_dir="screenshots",
- viewports=["desktop", "tablet", "mobile"],
- )
-
-Requirements:
- pip install playwright
- playwright install chromium
-"""
-
-from __future__ import annotations
-
-import logging
-import subprocess
-import sys
-import time
-from pathlib import Path
-from typing import Any
-
-logger = logging.getLogger(__name__)
-
-# Viewport configurations
-VIEWPORTS = {
- "desktop": {"width": 1920, "height": 1080},
- "tablet": {"width": 768, "height": 1024},
- "mobile": {"width": 375, "height": 667},
-}
-
-
-def ensure_playwright_installed() -> bool:
- """Check if Playwright is installed and install if needed.
-
- Returns:
- True if Playwright is available, False otherwise.
- """
- try:
- import playwright
- return True
- except ImportError:
- logger.warning("Playwright not installed. Installing...")
- try:
- subprocess.run(
- [sys.executable, "-m", "pip", "install", "playwright"],
- check=True,
- capture_output=True,
- )
- subprocess.run(
- ["playwright", "install", "chromium"],
- check=True,
- capture_output=True,
- )
- logger.info("Playwright installed successfully")
- return True
- except subprocess.CalledProcessError as e:
- logger.error(f"Failed to install Playwright: {e}")
- return False
-
-
-def generate_screenshots(
- html_path: str | Path,
- output_dir: str | Path,
- viewports: list[str] | None = None,
- states: list[str] | None = None,
-) -> dict[str, list[Path]]:
- """Generate screenshots of benchmark viewer in different viewports and states.
-
- Args:
- html_path: Path to the benchmark viewer HTML file.
- output_dir: Directory to save screenshots.
- viewports: List of viewport names to capture (default: all).
- states: List of states to capture (default: all).
- Options: "overview", "task_detail", "log_expanded", "log_collapsed"
-
- Returns:
- Dictionary mapping viewport names to lists of screenshot paths.
- """
- if not ensure_playwright_installed():
- logger.error("Cannot generate screenshots without Playwright")
- return {}
-
- from playwright.sync_api import sync_playwright
-
- html_path = Path(html_path)
- output_dir = Path(output_dir)
- output_dir.mkdir(parents=True, exist_ok=True)
-
- if viewports is None:
- viewports = list(VIEWPORTS.keys())
-
- if states is None:
- states = ["overview", "task_detail", "log_expanded", "log_collapsed"]
-
- screenshots: dict[str, list[Path]] = {vp: [] for vp in viewports}
-
- with sync_playwright() as p:
- browser = p.chromium.launch()
-
- for viewport_name in viewports:
- if viewport_name not in VIEWPORTS:
- logger.warning(f"Unknown viewport: {viewport_name}, skipping")
- continue
-
- viewport = VIEWPORTS[viewport_name]
- logger.info(f"Capturing {viewport_name} screenshots ({viewport['width']}x{viewport['height']})")
-
- page = browser.new_page(viewport=viewport)
-
- # Load the HTML file
- page.goto(f"file://{html_path.absolute()}")
-
- # Wait for page to load
- page.wait_for_load_state("networkidle")
- time.sleep(1) # Extra wait for animations
-
- # Capture overview state
- if "overview" in states:
- screenshot_path = output_dir / f"{viewport_name}_overview.png"
- page.screenshot(path=str(screenshot_path), full_page=True)
- screenshots[viewport_name].append(screenshot_path)
- logger.info(f" Saved: {screenshot_path}")
-
- # Select first task to show task detail
- try:
- task_items = page.query_selector_all(".task-item")
- if task_items and len(task_items) > 0:
- task_items[0].click()
- time.sleep(0.5) # Wait for task detail to load
-
- # Capture task detail state
- if "task_detail" in states:
- screenshot_path = output_dir / f"{viewport_name}_task_detail.png"
- page.screenshot(path=str(screenshot_path), full_page=True)
- screenshots[viewport_name].append(screenshot_path)
- logger.info(f" Saved: {screenshot_path}")
-
- # Expand log panel if it exists
- log_header = page.query_selector(".log-panel-header")
- if log_header and "log_expanded" in states:
- # Check if log panel is collapsed
- log_container = page.query_selector(".log-container")
- if log_container and "collapsed" in log_container.get_attribute("class"):
- log_header.click()
- time.sleep(0.3)
-
- screenshot_path = output_dir / f"{viewport_name}_log_expanded.png"
- page.screenshot(path=str(screenshot_path), full_page=True)
- screenshots[viewport_name].append(screenshot_path)
- logger.info(f" Saved: {screenshot_path}")
-
- # Collapse log panel
- if log_header and "log_collapsed" in states:
- log_header.click()
- time.sleep(0.3)
-
- screenshot_path = output_dir / f"{viewport_name}_log_collapsed.png"
- page.screenshot(path=str(screenshot_path), full_page=True)
- screenshots[viewport_name].append(screenshot_path)
- logger.info(f" Saved: {screenshot_path}")
-
- except Exception as e:
- logger.warning(f"Error capturing task detail states: {e}")
-
- page.close()
-
- browser.close()
-
- logger.info(f"Generated {sum(len(paths) for paths in screenshots.values())} screenshots")
- return screenshots
-
-
-def main():
- """CLI entry point for auto-screenshot tool."""
- import argparse
-
- parser = argparse.ArgumentParser(
- description="Generate screenshots of benchmark viewer"
- )
- parser.add_argument(
- "--html-path",
- required=True,
- help="Path to benchmark viewer HTML file",
- )
- parser.add_argument(
- "--output-dir",
- default="screenshots",
- help="Output directory for screenshots (default: screenshots)",
- )
- parser.add_argument(
- "--viewports",
- nargs="+",
- choices=list(VIEWPORTS.keys()),
- default=list(VIEWPORTS.keys()),
- help="Viewports to capture (default: all)",
- )
- parser.add_argument(
- "--states",
- nargs="+",
- choices=["overview", "task_detail", "log_expanded", "log_collapsed"],
- default=["overview", "task_detail", "log_expanded", "log_collapsed"],
- help="States to capture (default: all)",
- )
-
- args = parser.parse_args()
-
- logging.basicConfig(
- level=logging.INFO,
- format="%(asctime)s [%(levelname)s] %(message)s",
- datefmt="%H:%M:%S",
- )
-
- screenshots = generate_screenshots(
- html_path=args.html_path,
- output_dir=args.output_dir,
- viewports=args.viewports,
- states=args.states,
- )
-
- print("\nGenerated screenshots:")
- for viewport, paths in screenshots.items():
- print(f"\n{viewport}:")
- for path in paths:
- print(f" - {path}")
-
-
-if __name__ == "__main__":
- main()
diff --git a/openadapt_evals/benchmarks/azure.py b/openadapt_evals/benchmarks/azure.py
index 3835590..cfd35cc 100644
--- a/openadapt_evals/benchmarks/azure.py
+++ b/openadapt_evals/benchmarks/azure.py
@@ -80,6 +80,8 @@
def classify_task_complexity(task: BenchmarkTask) -> str:
"""Classify task complexity to select appropriate VM tier.
+    Classification priority: complex > medium > simple > default (medium).
+
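+    For example, a task in the "chrome" domain matches a medium indicator and is
+    classified "medium", while a task in the "file_explorer" domain falls through
+    to the simple-domain check and is classified "simple".
+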
Args:
task: The benchmark task to classify.
@@ -90,13 +92,6 @@ def classify_task_complexity(task: BenchmarkTask) -> str:
instruction = task.instruction.lower()
domain = (task.domain or "").lower()
- # Simple tasks: Notepad, File Explorer, basic Windows operations
- simple_indicators = [
- "notepad", "file explorer", "calculator", "paint",
- "open", "close", "minimize", "maximize",
- "create file", "delete file", "rename file",
- ]
-
# Complex tasks: Coding, debugging, multi-app workflows, data analysis
complex_indicators = [
"code", "debug", "compile", "ide", "visual studio",
@@ -104,9 +99,10 @@ def classify_task_complexity(task: BenchmarkTask) -> str:
"excel formula", "pivot table", "macro",
"multiple applications", "switch between",
"data analysis", "chart", "graph",
+ "multitasking",
]
- # Medium tasks: Browser, Office apps, email (everything else)
+ # Medium tasks: Browser, Office apps, email
medium_indicators = [
"browser", "chrome", "edge", "firefox",
"word", "excel", "powerpoint", "office",
@@ -114,21 +110,34 @@ def classify_task_complexity(task: BenchmarkTask) -> str:
"pdf", "acrobat",
]
+ # Simple tasks: Notepad, File Explorer, basic Windows operations
+    # Note: Checked AFTER the medium indicators so tasks that also mention a
+    # browser or Office app stay "medium"
+ simple_indicators = [
+ "notepad", "file explorer", "file_explorer", "calculator", "paint",
+ ]
+
+ # Simple domains take precedence for direct domain matching
+ simple_domains = {"notepad", "calculator", "paint", "file_explorer"}
+
# Check for complex indicators first
for indicator in complex_indicators:
if indicator in task_id or indicator in instruction or indicator in domain:
return "complex"
- # Check for simple indicators
- for indicator in simple_indicators:
- if indicator in task_id or indicator in instruction or indicator in domain:
- return "simple"
-
- # Check for medium indicators
+    # Check for medium indicators (browsers and Office apps are more complex than Notepad)
for indicator in medium_indicators:
if indicator in task_id or indicator in instruction or indicator in domain:
return "medium"
+ # Check for simple domains (direct match)
+ if domain in simple_domains:
+ return "simple"
+
+ # Check for simple indicators in task_id or instruction
+ for indicator in simple_indicators:
+ if indicator in task_id or indicator in instruction:
+ return "simple"
+
# Default to medium for unknown tasks
return "medium"
@@ -879,7 +888,7 @@ def run_evaluation(
print(" No stale instances found.")
# Load tasks
- from openadapt_evals.benchmarks.waa import WAAAdapter
+ from openadapt_evals.adapters.waa import WAAAdapter
adapter = WAAAdapter(waa_repo_path=self.waa_repo_path)
if task_ids:
diff --git a/openadapt_evals/benchmarks/base.py b/openadapt_evals/benchmarks/base.py
deleted file mode 100644
index 4f6dd6b..0000000
--- a/openadapt_evals/benchmarks/base.py
+++ /dev/null
@@ -1,35 +0,0 @@
-"""DEPRECATED: Import from openadapt_evals.adapters instead.
-
-This module is kept for backward compatibility only.
-All classes are re-exported from openadapt_evals.adapters.base.
-"""
-
-import warnings
-
-warnings.warn(
- "openadapt_evals.benchmarks.base is deprecated. "
- "Please import from openadapt_evals.adapters instead.",
- DeprecationWarning,
- stacklevel=2,
-)
-
-# Re-export from canonical location
-from openadapt_evals.adapters.base import (
- BenchmarkAction,
- BenchmarkAdapter,
- BenchmarkObservation,
- BenchmarkResult,
- BenchmarkTask,
- StaticDatasetAdapter,
- UIElement,
-)
-
-__all__ = [
- "BenchmarkAction",
- "BenchmarkAdapter",
- "BenchmarkObservation",
- "BenchmarkResult",
- "BenchmarkTask",
- "StaticDatasetAdapter",
- "UIElement",
-]
diff --git a/openadapt_evals/benchmarks/config.py b/openadapt_evals/benchmarks/config.py
new file mode 100644
index 0000000..2650e7d
--- /dev/null
+++ b/openadapt_evals/benchmarks/config.py
@@ -0,0 +1,261 @@
+"""Benchmark configuration management.
+
+This module provides configuration loading and storage for WAA benchmarks.
+Configuration can come from:
+1. ~/.openadapt/benchmark_config.json (user config)
+2. Environment variables
+3. Auto-detection of common paths
+
+Usage:
+ from openadapt_evals.benchmarks.config import get_config, BenchmarkConfig
+
+ config = get_config()
+ print(config.waa_examples_path)
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import os
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+# Default config file location
+CONFIG_DIR = Path.home() / ".openadapt"
+CONFIG_FILE = CONFIG_DIR / "benchmark_config.json"
+
+
+@dataclass
+class BenchmarkConfig:
+ """Configuration for WAA benchmark evaluation.
+
+ Attributes:
+ waa_examples_path: Path to WAA evaluation_examples_windows directory.
+ Contains task configs with evaluator specs.
+ default_agent: Default agent type for evaluation (e.g., "api-openai").
+ server_url: Default WAA server URL.
+ default_task_list: Which task list to use ("test_small", "test_all", "test_custom").
+ """
+
+ waa_examples_path: str | None = None
+ default_agent: str = "api-openai"
+ server_url: str = "http://localhost:5000"
+ default_task_list: str = "test_small"
+
+ def to_dict(self) -> dict[str, Any]:
+ """Convert to dictionary for JSON serialization."""
+ return {
+ "waa_examples_path": self.waa_examples_path,
+ "default_agent": self.default_agent,
+ "server_url": self.server_url,
+ "default_task_list": self.default_task_list,
+ }
+
+ @classmethod
+ def from_dict(cls, data: dict[str, Any]) -> "BenchmarkConfig":
+ """Create from dictionary."""
+ return cls(
+ waa_examples_path=data.get("waa_examples_path"),
+ default_agent=data.get("default_agent", "api-openai"),
+ server_url=data.get("server_url", "http://localhost:5000"),
+ default_task_list=data.get("default_task_list", "test_small"),
+ )
+
+
+def _find_waa_examples() -> str | None:
+ """Auto-detect WAA examples directory.
+
+ Searches common locations for the WAA evaluation_examples_windows directory.
+
+ Returns:
+ Path to examples directory if found, None otherwise.
+ """
+ # Common relative paths from various working directories
+ candidates = [
+ # From openadapt-evals
+ Path("../openadapt-ml/vendor/WindowsAgentArena/src/win-arena-container/client/evaluation_examples_windows"),
+ # From openadapt-ml
+ Path("vendor/WindowsAgentArena/src/win-arena-container/client/evaluation_examples_windows"),
+ # Absolute path in user's common locations
+ Path.home() / "oa/src/openadapt-ml/vendor/WindowsAgentArena/src/win-arena-container/client/evaluation_examples_windows",
+        # (the WAA_EXAMPLES_PATH env var, if set, is prepended below)
+ ]
+
+ # Check WAA_EXAMPLES_PATH environment variable
+ env_path = os.environ.get("WAA_EXAMPLES_PATH")
+ if env_path:
+ candidates.insert(0, Path(env_path))
+
+ for path in candidates:
+ resolved = path.resolve()
+ if resolved.exists() and (resolved / "test_small.json").exists():
+ logger.info(f"Auto-detected WAA examples at: {resolved}")
+ return str(resolved)
+
+ return None
+
+
+def load_config() -> BenchmarkConfig:
+ """Load configuration from file and environment.
+
+ Priority (highest to lowest):
+ 1. Environment variables (WAA_EXAMPLES_PATH, etc.)
+ 2. Config file (~/.openadapt/benchmark_config.json)
+ 3. Auto-detection
+ 4. Defaults
+
+ Returns:
+ BenchmarkConfig instance.
+ """
+ config = BenchmarkConfig()
+
+ # Load from config file if it exists
+ if CONFIG_FILE.exists():
+ try:
+ with open(CONFIG_FILE, encoding="utf-8") as f:
+ data = json.load(f)
+ config = BenchmarkConfig.from_dict(data)
+ logger.info(f"Loaded config from {CONFIG_FILE}")
+ except Exception as e:
+ logger.warning(f"Failed to load config from {CONFIG_FILE}: {e}")
+
+ # Override with environment variables
+ if os.environ.get("WAA_EXAMPLES_PATH"):
+ config.waa_examples_path = os.environ["WAA_EXAMPLES_PATH"]
+ if os.environ.get("WAA_SERVER_URL"):
+ config.server_url = os.environ["WAA_SERVER_URL"]
+ if os.environ.get("WAA_DEFAULT_AGENT"):
+ config.default_agent = os.environ["WAA_DEFAULT_AGENT"]
+
+ # Auto-detect waa_examples_path if not set
+ if not config.waa_examples_path:
+ config.waa_examples_path = _find_waa_examples()
+
+ return config
+
+
+def save_config(config: BenchmarkConfig) -> None:
+ """Save configuration to file.
+
+ Args:
+ config: Configuration to save.
+ """
+ CONFIG_DIR.mkdir(parents=True, exist_ok=True)
+
+ with open(CONFIG_FILE, "w", encoding="utf-8") as f:
+ json.dump(config.to_dict(), f, indent=2)
+
+ logger.info(f"Saved config to {CONFIG_FILE}")
+
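+
+# Illustrative round-trip (a sketch, not a prescribed workflow; assumes write
+# access to ~/.openadapt):
+#
+#     os.environ["WAA_EXAMPLES_PATH"] = "/data/WAA/evaluation_examples_windows"
+#     cfg = load_config()         # env var takes precedence over file/auto-detect
+#     cfg.default_agent = "api-claude"
+#     save_config(cfg)            # written to ~/.openadapt/benchmark_config.json
+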
+
+# Global config instance (lazy loaded)
+_config: BenchmarkConfig | None = None
+
+
+def get_config() -> BenchmarkConfig:
+ """Get the global benchmark configuration.
+
+ Loads configuration on first call and caches it.
+
+ Returns:
+ BenchmarkConfig instance.
+ """
+ global _config
+ if _config is None:
+ _config = load_config()
+ return _config
+
+
+def reset_config() -> None:
+ """Reset the global config (for testing)."""
+ global _config
+ _config = None
+
+
+def load_task_list(
+ examples_path: str,
+ task_list: str = "test_small",
+) -> dict[str, list[str]]:
+ """Load task list from WAA examples directory.
+
+ Args:
+ examples_path: Path to WAA evaluation_examples_windows directory.
+ task_list: Which task list to load ("test_small", "test_all", "test_custom").
+
+ Returns:
+ Dict mapping domain -> list of task IDs.
+
+ Raises:
+ FileNotFoundError: If task list file not found.
+ """
+ task_file = Path(examples_path) / f"{task_list}.json"
+ if not task_file.exists():
+ raise FileNotFoundError(f"Task list not found: {task_file}")
+
+ with open(task_file, encoding="utf-8") as f:
+ return json.load(f)
+
+
+def get_all_task_ids(
+ examples_path: str,
+ task_list: str = "test_small",
+ domains: list[str] | None = None,
+) -> list[tuple[str, str]]:
+ """Get all task IDs with their domains.
+
+ Args:
+ examples_path: Path to WAA evaluation_examples_windows directory.
+ task_list: Which task list to use.
+ domains: Filter to specific domains (None = all domains).
+
+ Returns:
+ List of (domain, task_id) tuples.
+ """
+ tasks = load_task_list(examples_path, task_list)
+
+ result = []
+ for domain, task_ids in tasks.items():
+ if domains is None or domain in domains:
+ for task_id in task_ids:
+ result.append((domain, task_id))
+
+ return result
+
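+# The task list files are assumed to map domain -> list of task IDs, e.g.:
+#
+#     {"chrome": ["2ae9ba84-3a0d-4d4c-8338-3a1478dc5fe3-wos", "..."], "notepad": ["..."]}
+#
+# so get_all_task_ids(path, "test_small", domains=["chrome"]) yields
+# [("chrome", "2ae9ba84-3a0d-4d4c-8338-3a1478dc5fe3-wos"), ...].
+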
+
+def load_task_config(
+ examples_path: str,
+ domain: str,
+ task_id: str,
+) -> dict[str, Any]:
+ """Load a specific task's configuration.
+
+ Args:
+ examples_path: Path to WAA evaluation_examples_windows directory.
+ domain: Task domain (e.g., "chrome", "notepad").
+ task_id: Task ID (e.g., "2ae9ba84-3a0d-4d4c-8338-3a1478dc5fe3-wos").
+
+ Returns:
+ Task configuration dict with evaluator spec.
+
+ Raises:
+ FileNotFoundError: If task config not found.
+ """
+ # Try different path formats
+ candidates = [
+ Path(examples_path) / "examples" / domain / f"{task_id}.json",
+ Path(examples_path) / domain / f"{task_id}.json",
+ ]
+
+ for task_file in candidates:
+ if task_file.exists():
+ with open(task_file, encoding="utf-8") as f:
+ return json.load(f)
+
+ raise FileNotFoundError(
+ f"Task config not found for {domain}/{task_id}. "
+ f"Tried: {[str(c) for c in candidates]}"
+ )
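+
+
+# Putting the helpers together (a sketch; actual paths depend on your WAA checkout):
+#
+#     examples = get_config().waa_examples_path
+#     for domain, task_id in get_all_task_ids(examples, "test_small"):
+#         task_cfg = load_task_config(examples, domain, task_id)
+#         evaluator_spec = task_cfg.get("evaluator")  # spec consumed by the WAA evaluator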
diff --git a/openadapt_evals/benchmarks/dashboard_server.py b/openadapt_evals/benchmarks/dashboard_server.py
deleted file mode 100644
index a28baf8..0000000
--- a/openadapt_evals/benchmarks/dashboard_server.py
+++ /dev/null
@@ -1,944 +0,0 @@
-"""Auto-launching Azure resource monitoring dashboard.
-
-This module provides a real-time web dashboard that automatically displays:
-- Active Azure resources (VMs, containers, compute instances)
-- Real-time costs with breakdown by resource type
-- Live activity from WAA evaluations (screenshots, actions, task progress)
-- Resource utilization (CPU, memory, disk)
-- Logs from vm-setup, evaluations, etc.
-- Controls to stop/start expensive resources
-
-The dashboard automatically launches in the browser when Azure resources are started
-and persists across multiple command invocations.
-
-Usage:
- # Auto-launch when starting resources
- from openadapt_evals.benchmarks.dashboard_server import ensure_dashboard_running
-
- ensure_dashboard_running() # Starts server if not running, opens browser
-
- # Or run standalone
- python -m openadapt_evals.benchmarks.dashboard_server
-"""
-
-from __future__ import annotations
-
-import argparse
-import json
-import logging
-import os
-import subprocess
-import sys
-import threading
-import time
-import webbrowser
-from dataclasses import asdict, dataclass
-from datetime import datetime, timedelta, timezone
-from pathlib import Path
-from typing import Any
-
-from flask import Flask, jsonify, render_template_string, request
-from flask_cors import CORS
-
-logger = logging.getLogger(__name__)
-
-# Global dashboard state
-_dashboard_server_thread: threading.Thread | None = None
-_dashboard_port: int = 5555
-_dashboard_url: str = f"http://127.0.0.1:{_dashboard_port}"
-
-
-@dataclass
-class ResourceInfo:
- """Information about an Azure resource."""
-
- resource_type: str # "vm", "compute", "container"
- name: str
- status: str
- cost_per_hour: float
- location: str
- size: str | None = None
- public_ip: str | None = None
- created_time: str | None = None
- uptime_hours: float = 0.0
-
-
-@dataclass
-class CostBreakdown:
- """Cost breakdown by resource type."""
-
- compute_per_hour: float = 0.0
- storage_per_hour: float = 0.0
- network_per_hour: float = 0.0
- total_per_hour: float = 0.0
- total_today: float = 0.0
- total_this_week: float = 0.0
- total_this_month: float = 0.0
-
-
-@dataclass
-class ActivityInfo:
- """Live activity information."""
-
- current_task: str | None = None
- task_progress: str | None = None # "5/154 tasks completed"
- latest_screenshot: str | None = None # base64 or URL
- action_count: int = 0
- latest_actions: list[str] | None = None
- logs: list[str] | None = None
-
-
-class DashboardState:
- """Maintains current state of Azure resources and costs."""
-
- def __init__(self):
- self.resources: list[ResourceInfo] = []
- self.costs = CostBreakdown()
- self.activity = ActivityInfo()
- self.last_updated = datetime.now(timezone.utc)
- self._lock = threading.Lock()
-
- def update_resources(self, resources: list[ResourceInfo]) -> None:
- """Update resource list."""
- with self._lock:
- self.resources = resources
- self._update_costs()
- self.last_updated = datetime.now(timezone.utc)
-
- def update_activity(self, activity: ActivityInfo) -> None:
- """Update activity information."""
- with self._lock:
- self.activity = activity
- self.last_updated = datetime.now(timezone.utc)
-
- def _update_costs(self) -> None:
- """Calculate total costs from resources."""
- compute_cost = sum(r.cost_per_hour for r in self.resources if r.status == "running")
-
- # Estimate storage/network (usually much smaller than compute)
- storage_cost = len(self.resources) * 0.01 # ~$0.01/hour per resource
- network_cost = 0.05 if self.resources else 0.0 # Fixed small amount
-
- self.costs = CostBreakdown(
- compute_per_hour=compute_cost,
- storage_per_hour=storage_cost,
- network_per_hour=network_cost,
- total_per_hour=compute_cost + storage_cost + network_cost,
- total_today=compute_cost * 24, # Rough estimate
- total_this_week=compute_cost * 24 * 7,
- total_this_month=compute_cost * 720, # 30 days
- )
-
- def to_dict(self) -> dict[str, Any]:
- """Convert to dictionary for JSON serialization."""
- with self._lock:
- return {
- "resources": [asdict(r) for r in self.resources],
- "costs": asdict(self.costs),
- "activity": asdict(self.activity),
- "last_updated": self.last_updated.isoformat(),
- }
-
-
-# Global state
-dashboard_state = DashboardState()
-
-
-def get_azure_resources() -> list[ResourceInfo]:
- """Query Azure for currently running resources.
-
- Returns:
- List of active Azure resources with cost information.
- """
- resources = []
-
- try:
- # Get VMs
- result = subprocess.run(
- ["az", "vm", "list", "--show-details", "--query",
- "[].{name:name, status:powerState, size:hardwareProfile.vmSize, "
- "location:location, rg:resourceGroup, publicIps:publicIps}", "-o", "json"],
- capture_output=True,
- text=True,
- timeout=30,
- )
-
- if result.returncode == 0:
- vms = json.loads(result.stdout)
- for vm in vms:
- # Estimate cost based on VM size
- cost = estimate_vm_cost(vm.get("size", "Unknown"))
-
- resources.append(ResourceInfo(
- resource_type="vm",
- name=vm.get("name", "Unknown"),
- status=vm.get("status", "Unknown"),
- cost_per_hour=cost,
- location=vm.get("location", "Unknown"),
- size=vm.get("size"),
- public_ip=vm.get("publicIps"),
- ))
- except Exception as e:
- logger.warning(f"Failed to query Azure VMs: {e}")
-
- try:
- # Get Azure ML compute instances
- # This requires resource group and workspace name from env
- rg = os.getenv("AZURE_ML_RESOURCE_GROUP", "openadapt-agents")
- ws = os.getenv("AZURE_ML_WORKSPACE_NAME", "openadapt-ml")
-
- result = subprocess.run(
- ["az", "ml", "compute", "list",
- "--resource-group", rg,
- "--workspace-name", ws,
- "--query", "[].{name:name, status:state, size:size, created:created_on}",
- "-o", "json"],
- capture_output=True,
- text=True,
- timeout=30,
- )
-
- if result.returncode == 0:
- computes = json.loads(result.stdout)
- for compute in computes:
- cost = estimate_vm_cost(compute.get("size", "Unknown"))
-
- resources.append(ResourceInfo(
- resource_type="compute",
- name=compute.get("name", "Unknown"),
- status=compute.get("status", "Unknown"),
- cost_per_hour=cost,
- location="azure-ml",
- size=compute.get("size"),
- created_time=compute.get("created"),
- ))
- except Exception as e:
- logger.warning(f"Failed to query Azure ML compute: {e}")
-
- return resources
-
-
-def estimate_vm_cost(vm_size: str) -> float:
- """Estimate hourly cost for a VM size.
-
- Args:
- vm_size: Azure VM size (e.g., "Standard_D4_v3").
-
- Returns:
- Estimated hourly cost in USD.
- """
- # Map common VM sizes to costs (East US pricing)
- cost_map = {
- "Standard_D2_v3": 0.096,
- "Standard_D4_v3": 0.192,
- "Standard_D8_v3": 0.384,
- "Standard_D2s_v3": 0.096,
- "Standard_D4s_v3": 0.192,
- "Standard_D8s_v3": 0.384,
- "Standard_D4ds_v5": 0.20,
- "Standard_D4s_v5": 0.192,
- }
-
- return cost_map.get(vm_size, 0.20) # Default to $0.20/hour
-
-
-def get_live_activity() -> ActivityInfo:
- """Get current live activity from evaluation tracking.
-
- Returns:
- Current activity information.
- """
- activity = ActivityInfo()
-
- # Try to load live tracking file
- live_file = Path("benchmark_live.json")
- if live_file.exists():
- try:
- with open(live_file) as f:
- data = json.load(f)
-
- if data.get("status") == "running":
- current = data.get("current_task", {})
- activity.current_task = current.get("instruction", "Unknown task")
-
- total = data.get("total_tasks", 0)
- completed = data.get("tasks_completed", 0)
- activity.task_progress = f"{completed}/{total} tasks completed"
-
- # Get recent actions
- steps = current.get("steps", [])
- if steps:
- activity.action_count = len(steps)
- activity.latest_actions = [
- f"Step {s['step_idx']}: {s['action']['type']}"
- for s in steps[-5:] # Last 5 actions
- ]
- except Exception as e:
- logger.warning(f"Failed to read live tracking file: {e}")
-
- # Try to load recent logs
- try:
- log_files = sorted(Path(".").glob("*.log"), key=lambda p: p.stat().st_mtime, reverse=True)
- if log_files:
- with open(log_files[0]) as f:
- lines = f.readlines()
- activity.logs = [line.strip() for line in lines[-10:]] # Last 10 lines
- except Exception as e:
- logger.warning(f"Failed to read log files: {e}")
-
- return activity
-
-
-# HTML template for dashboard
-DASHBOARD_HTML = """
-<!-- Inline dashboard markup (elided). Structure: "Azure Resource Dashboard" page with
-     a High Cost Alert banner ("Your resources are costing over $5/hour. Consider
-     stopping unused VMs to reduce costs."), a Cost Summary panel (per hour / today /
-     this week / this month estimates), an Active Resources panel (running VMs,
-     compute instances, total resources), a Current Activity panel (task, progress,
-     action count), and a footer "Auto-refreshing every 5 seconds | Last updated: -". -->
-"""
-
-
-def create_dashboard_app() -> Flask:
- """Create Flask app for dashboard."""
- app = Flask(__name__)
- CORS(app)
-
- @app.route("/")
- def index():
- """Serve dashboard HTML."""
- return render_template_string(DASHBOARD_HTML)
-
- @app.route("/api/dashboard")
- def get_dashboard_data():
- """Get current dashboard state."""
- # Update resources in background
- threading.Thread(target=_update_dashboard_state, daemon=True).start()
-
- return jsonify(dashboard_state.to_dict())
-
- @app.route("/api/control", methods=["POST"])
- def control_resource():
- """Start or stop a resource."""
- data = request.json
- action = data.get("action")
- name = data.get("name")
- resource_type = data.get("type")
-
- if not all([action, name, resource_type]):
- return jsonify({"error": "Missing required fields"}), 400
-
- try:
- if resource_type == "vm":
- if action == "stop":
- subprocess.run(
- ["uv", "run", "python", "-m", "openadapt_evals.benchmarks.cli",
- "vm-stop", "--vm-name", name, "--no-wait"],
- check=True,
- )
- return jsonify({"message": f"Stop command sent to {name}"})
- elif action == "start":
- subprocess.run(
- ["uv", "run", "python", "-m", "openadapt_evals.benchmarks.cli",
- "vm-start", "--vm-name", name],
- check=True,
- )
- return jsonify({"message": f"Start command sent to {name}"})
-
- return jsonify({"error": f"Unsupported action: {action} for {resource_type}"}), 400
-
- except subprocess.CalledProcessError as e:
- return jsonify({"error": f"Command failed: {e}"}), 500
-
- @app.route("/benchmark/latest")
- def latest_benchmark():
- """Serve latest benchmark viewer."""
- viewer_path = Path("/Users/abrichr/oa/src/openadapt-evals/benchmark_results/waa-live_eval_20260116_200004/viewer.html")
- if viewer_path.exists():
- return viewer_path.read_text()
- return "No benchmark results available", 404
-
- @app.route("/health")
- def health():
- """Health check."""
- return jsonify({"status": "ok"})
-
- return app
-
-
-def _update_dashboard_state() -> None:
- """Update dashboard state (run in background thread)."""
- try:
- resources = get_azure_resources()
- dashboard_state.update_resources(resources)
-
- activity = get_live_activity()
- dashboard_state.update_activity(activity)
- except Exception as e:
- logger.error(f"Failed to update dashboard state: {e}")
-
-
-def run_dashboard_server(port: int = 5555, host: str = "127.0.0.1") -> None:
- """Run dashboard server (blocking).
-
- Args:
- port: Port to run on.
- host: Host to bind to.
- """
- app = create_dashboard_app()
-
- logger.info(f"Starting dashboard server on {host}:{port}")
- logger.info(f"Dashboard URL: http://{host}:{port}")
-
- # Initial state update
- _update_dashboard_state()
-
- # Run Flask app
- app.run(host=host, port=port, debug=False, threaded=True)
-
-
-def is_dashboard_running(port: int = 5555) -> bool:
- """Check if dashboard server is already running.
-
- Args:
- port: Port to check.
-
- Returns:
- True if server is running, False otherwise.
- """
- try:
- import requests
- response = requests.get(f"http://127.0.0.1:{port}/health", timeout=2)
- return response.status_code == 200
- except Exception:
- return False
-
-
-def start_dashboard_background(port: int = 5555) -> None:
- """Start dashboard server in background thread.
-
- Args:
- port: Port to run on.
- """
- global _dashboard_server_thread, _dashboard_port
-
- if is_dashboard_running(port):
- logger.info(f"Dashboard already running on port {port}")
- return
-
- _dashboard_port = port
-
- def run():
- run_dashboard_server(port=port)
-
- _dashboard_server_thread = threading.Thread(target=run, daemon=True)
- _dashboard_server_thread.start()
-
- # Wait a moment for server to start
- time.sleep(2)
-
- logger.info(f"Dashboard server started on port {port}")
-
-
-def ensure_dashboard_running(auto_open: bool = True, port: int = 5555) -> str:
- """Ensure dashboard server is running and optionally open browser.
-
- This is the main entry point for auto-launching the dashboard.
-
- Args:
- auto_open: Whether to open browser automatically.
- port: Port to run on.
-
- Returns:
- Dashboard URL.
- """
- global _dashboard_url
- _dashboard_url = f"http://127.0.0.1:{port}"
-
- # Start server if not running
- if not is_dashboard_running(port):
- logger.info("Starting dashboard server...")
- start_dashboard_background(port)
-
- # Wait for server to be ready
- for _ in range(10):
- if is_dashboard_running(port):
- break
- time.sleep(0.5)
-
- # Open browser
- if auto_open:
- logger.info(f"Opening dashboard in browser: {_dashboard_url}")
- webbrowser.open(_dashboard_url)
-
- return _dashboard_url
-
-
-def main() -> int:
- """CLI entry point."""
- parser = argparse.ArgumentParser(description="Azure Resource Dashboard")
- parser.add_argument("--port", type=int, default=5555, help="Port to run on")
- parser.add_argument("--host", type=str, default="127.0.0.1", help="Host to bind to")
- parser.add_argument("--no-open", action="store_true", help="Don't open browser")
-
- args = parser.parse_args()
-
- # Configure logging
- logging.basicConfig(
- level=logging.INFO,
- format="%(asctime)s [%(levelname)s] %(message)s",
- )
-
- # Open browser unless disabled
- if not args.no_open:
- url = f"http://{args.host}:{args.port}"
- threading.Timer(1.0, lambda: webbrowser.open(url)).start()
-
- # Run server (blocking)
- run_dashboard_server(port=args.port, host=args.host)
-
- return 0
-
-
-if __name__ == "__main__":
- sys.exit(main())
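# Illustrative usage sketch (not part of this diff): polling the deleted dashboard's
# JSON endpoints from a separate script while the server runs locally. Assumes the
# default port 5555; the /health and /api/dashboard paths come from
# create_dashboard_app() above.
import json
import urllib.request

def poll_dashboard(base_url: str = "http://127.0.0.1:5555") -> dict:
    """Return the dashboard state dict; raises if the server is unreachable."""
    with urllib.request.urlopen(f"{base_url}/health", timeout=2) as resp:
        if resp.status != 200:
            raise RuntimeError("dashboard health check failed")
    with urllib.request.urlopen(f"{base_url}/api/dashboard", timeout=5) as resp:
        return json.loads(resp.read().decode("utf-8"))

if __name__ == "__main__":
    state = poll_dashboard()
    print(json.dumps(state, indent=2)[:500])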
diff --git a/openadapt_evals/benchmarks/generate_synthetic_demos.py b/openadapt_evals/benchmarks/generate_synthetic_demos.py
deleted file mode 100644
index 893db25..0000000
--- a/openadapt_evals/benchmarks/generate_synthetic_demos.py
+++ /dev/null
@@ -1,824 +0,0 @@
-"""Generate synthetic demonstration trajectories for Windows Agent Arena tasks.
-
-This module provides functionality to generate high-quality synthetic demonstration
-trajectories for all 154 WAA tasks. These demos enable demo-conditioned prompting,
-which improves first-action accuracy from 33% to 100%.
-
-The generation uses a hybrid approach:
-1. LLM-based generation for complex, domain-specific trajectories
-2. Rule-based templates for common patterns (open app, type text, save)
-3. Domain knowledge to ensure realistic action sequences
-
-Usage:
- # Generate demos for all tasks
- python -m openadapt_evals.benchmarks.generate_synthetic_demos --all
-
- # Generate for specific domains
- python -m openadapt_evals.benchmarks.generate_synthetic_demos --domains notepad,browser
-
- # Generate for specific task IDs
- python -m openadapt_evals.benchmarks.generate_synthetic_demos --task-ids notepad_1,browser_5
-
- # Use specific LLM provider
- python -m openadapt_evals.benchmarks.generate_synthetic_demos --all --provider anthropic
-"""
-
-import argparse
-import json
-import logging
-import os
-import re
-from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple
-
-from anthropic import Anthropic
-from openai import OpenAI
-
-logger = logging.getLogger(__name__)
-
-# WAA domain information
-WAA_DOMAINS = {
- "notepad": {
- "description": "Windows Notepad text editor tasks",
- "common_apps": ["Notepad"],
- "example_tasks": ["Open file", "Edit text", "Save document", "Find and replace"],
- },
- "browser": {
- "description": "Web browser navigation and interaction (Chrome/Edge)",
- "common_apps": ["Google Chrome", "Microsoft Edge"],
- "example_tasks": ["Navigate to URL", "Search", "Bookmark", "Settings"],
- },
- "office": {
- "description": "Office productivity applications (Word, Excel, Outlook)",
- "common_apps": ["LibreOffice Writer", "LibreOffice Calc", "Microsoft Word"],
- "example_tasks": ["Create document", "Format text", "Create spreadsheet", "Insert table"],
- },
- "coding": {
- "description": "Programming and development tasks",
- "common_apps": ["Visual Studio Code", "Notepad++", "Terminal"],
- "example_tasks": ["Open project", "Edit code", "Run debugger", "Terminal commands"],
- },
- "media": {
- "description": "Media playback and management",
- "common_apps": ["VLC Media Player", "Windows Media Player"],
- "example_tasks": ["Play video", "Adjust volume", "Create playlist", "Change subtitle"],
- },
- "paint": {
- "description": "Windows Paint drawing application",
- "common_apps": ["Paint"],
- "example_tasks": ["Draw shape", "Fill color", "Add text", "Save image"],
- },
- "file_explorer": {
- "description": "Windows File Explorer file management",
- "common_apps": ["File Explorer"],
- "example_tasks": ["Navigate folders", "Create folder", "Copy file", "Rename item"],
- },
- "clock": {
- "description": "Windows Clock application (alarms, timers, stopwatch)",
- "common_apps": ["Clock"],
- "example_tasks": ["Set alarm", "Start timer", "Use stopwatch", "World clock"],
- },
- "settings": {
- "description": "Windows Settings application",
- "common_apps": ["Settings"],
- "example_tasks": ["Change display", "Network settings", "Sound settings", "Privacy"],
- },
- "edge": {
- "description": "Microsoft Edge browser specific tasks",
- "common_apps": ["Microsoft Edge"],
- "example_tasks": ["Manage extensions", "Collections", "Reading mode", "Browser settings"],
- },
- "vscode": {
- "description": "Visual Studio Code IDE tasks",
- "common_apps": ["Visual Studio Code"],
- "example_tasks": ["Open workspace", "Install extension", "Debug code", "Git operations"],
- },
-}
-
-# Common action patterns
-COMMON_PATTERNS = {
- "open_app": [
- ("Click Start menu", "CLICK(x=0.02, y=0.98)"),
- ("Type app name", "TYPE('{app_name}')"),
- ("Wait for search results", "WAIT(1.0)"),
- ("Click on app", "CLICK(x=0.15, y=0.3)"),
- ],
- "save_file": [
- ("Open File menu", "HOTKEY('alt', 'f')"),
- ("Click Save As", "CLICK(x=0.1, y=0.4)"),
- ("Type filename", "TYPE('{filename}')"),
- ("Click Save button", "CLICK(x=0.6, y=0.9)"),
- ],
- "close_app": [
- ("Click close button", "CLICK(x=0.99, y=0.02)"),
- ("Or use hotkey", "HOTKEY('alt', 'f4')"),
- ],
-}
-
-
-class SyntheticDemoGenerator:
- """Generates synthetic demonstrations for WAA tasks."""
-
- def __init__(
- self,
- provider: str = "anthropic",
- model: Optional[str] = None,
- api_key: Optional[str] = None,
- output_dir: Optional[Path] = None,
- ):
- """Initialize the demo generator.
-
- Args:
- provider: LLM provider ("anthropic" or "openai")
- model: Model name (uses default if not specified)
- api_key: API key (uses environment variable if not specified)
- output_dir: Output directory for generated demos
- """
- self.provider = provider.lower()
- self.output_dir = output_dir or Path(__file__).parent.parent.parent / "demo_library" / "synthetic_demos"
- self.output_dir.mkdir(parents=True, exist_ok=True)
-
- # Initialize LLM client
- if self.provider == "anthropic":
- self.api_key = api_key or os.getenv("ANTHROPIC_API_KEY")
- self.model = model or "claude-sonnet-4-5-20250929"
- self.client = Anthropic(api_key=self.api_key)
- elif self.provider == "openai":
- self.api_key = api_key or os.getenv("OPENAI_API_KEY")
- self.model = model or "gpt-5-turbo-2025-01-01"
- self.client = OpenAI(api_key=self.api_key)
- else:
- raise ValueError(f"Unsupported provider: {provider}")
-
- logger.info(f"Initialized demo generator with {self.provider}/{self.model}")
-
- def generate_demo(
- self,
- task_id: str,
- instruction: str,
- domain: str,
- use_llm: bool = True,
- ) -> str:
- """Generate a synthetic demo for a task.
-
- Args:
- task_id: Task identifier (e.g., "notepad_1")
- instruction: Task instruction text
- domain: Task domain
- use_llm: Whether to use LLM generation (vs template-based)
-
- Returns:
- Generated demo text in the standard format
- """
- logger.info(f"Generating demo for {task_id} ({domain})")
-
- # Try template-based generation first for simple tasks
- if not use_llm:
- demo = self._generate_template_demo(task_id, instruction, domain)
- if demo:
- return demo
-
- # Fall back to LLM generation
- return self._generate_llm_demo(task_id, instruction, domain)
-
- def _generate_template_demo(
- self,
- task_id: str,
- instruction: str,
- domain: str,
- ) -> Optional[str]:
- """Generate demo using rule-based templates for simple tasks.
-
- Args:
- task_id: Task identifier
- instruction: Task instruction
- domain: Task domain
-
- Returns:
- Generated demo text or None if template doesn't match
- """
- instruction_lower = instruction.lower()
-
- # Detect simple patterns
- if "open" in instruction_lower and domain in ["notepad", "paint", "calculator"]:
- return self._template_open_app(task_id, instruction, domain)
- elif "type" in instruction_lower and domain == "notepad":
- return self._template_type_text(task_id, instruction, domain)
- elif "save" in instruction_lower:
- return self._template_save_file(task_id, instruction, domain)
-
- return None
-
- def _template_open_app(self, task_id: str, instruction: str, domain: str) -> str:
- """Template for opening an application."""
- app_name = WAA_DOMAINS.get(domain, {}).get("common_apps", [domain.title()])[0]
-
- return f"""TASK: {instruction}
-DOMAIN: {domain}
-
-STEPS:
-1. Click on the Start menu button in the taskbar
- REASONING: Need to access the Start menu to find {app_name}
- ACTION: CLICK(x=0.02, y=0.98)
-
-2. Type "{app_name.lower()}" in the search box
- REASONING: Searching is faster than navigating through menus
- ACTION: TYPE("{app_name.lower()}")
-
-3. Wait for search results to appear
- REASONING: Windows needs time to index and display results
- ACTION: WAIT(1.0)
-
-4. Click on the {app_name} app in search results
- REASONING: {app_name} should appear as the first result
- ACTION: CLICK(x=0.15, y=0.3)
-
-5. Verify {app_name} window is open
- REASONING: Confirm the application launched successfully
- ACTION: DONE()
-
-EXPECTED_OUTCOME: {app_name} application is open and ready to use
-"""
-
- def _template_type_text(self, task_id: str, instruction: str, domain: str) -> str:
- """Template for typing text in Notepad."""
- # Extract text to type from instruction
- text_match = re.search(r'"([^"]+)"', instruction)
- text_to_type = text_match.group(1) if text_match else "sample text"
-
- return f"""TASK: {instruction}
-DOMAIN: {domain}
-
-STEPS:
-1. Click on the Start menu button
- REASONING: Need to open Notepad first
- ACTION: CLICK(x=0.02, y=0.98)
-
-2. Type "notepad" in the search box
- REASONING: Search for Notepad application
- ACTION: TYPE("notepad")
-
-3. Wait for search results
- REASONING: Allow Windows to display search results
- ACTION: WAIT(1.0)
-
-4. Click on Notepad in results
- REASONING: Launch the application
- ACTION: CLICK(x=0.15, y=0.3)
-
-5. Wait for Notepad to open
- REASONING: Allow application to fully load
- ACTION: WAIT(0.5)
-
-6. Click in the text area
- REASONING: Set focus to the editing area
- ACTION: CLICK(x=0.5, y=0.5)
-
-7. Type the text: "{text_to_type}"
- REASONING: Enter the required text content
- ACTION: TYPE("{text_to_type}")
-
-8. Verify text is displayed
- REASONING: Confirm text was entered correctly
- ACTION: DONE()
-
-EXPECTED_OUTCOME: Notepad is open with "{text_to_type}" displayed
-"""
-
- def _template_save_file(self, task_id: str, instruction: str, domain: str) -> str:
- """Template for saving a file."""
- # Extract filename from instruction
- filename_match = re.search(r'"([^"]+\.\w+)"', instruction)
- filename = filename_match.group(1) if filename_match else "document.txt"
-
- return f"""TASK: {instruction}
-DOMAIN: {domain}
-
-STEPS:
-1. Press Ctrl+S to open Save dialog
- REASONING: Keyboard shortcut is fastest way to save
- ACTION: HOTKEY("ctrl", "s")
-
-2. Wait for Save dialog to appear
- REASONING: Dialog needs time to render
- ACTION: WAIT(0.5)
-
-3. Type filename in the filename field
- REASONING: Specify the desired filename
- ACTION: TYPE("{filename}")
-
-4. Click the Save button
- REASONING: Confirm and execute the save operation
- ACTION: CLICK(x=0.7, y=0.9)
-
-5. Wait for file to save
- REASONING: Allow time for file write operation
- ACTION: WAIT(0.5)
-
-6. Verify file is saved (title bar shows filename)
- REASONING: Confirm save was successful
- ACTION: DONE()
-
-EXPECTED_OUTCOME: File is saved as "{filename}"
-"""
-
- def _generate_llm_demo(
- self,
- task_id: str,
- instruction: str,
- domain: str,
- ) -> str:
- """Generate demo using LLM.
-
- Args:
- task_id: Task identifier
- instruction: Task instruction
- domain: Task domain
-
- Returns:
- Generated demo text
- """
- # Build prompt with domain context and examples
- domain_info = WAA_DOMAINS.get(domain, {})
- domain_desc = domain_info.get("description", f"{domain} tasks")
- common_apps = ", ".join(domain_info.get("common_apps", [domain.title()]))
-
- prompt = f"""Generate a step-by-step demonstration trajectory for a Windows automation task.
-
-TASK INFORMATION:
-- Task ID: {task_id}
-- Instruction: {instruction}
-- Domain: {domain} ({domain_desc})
-- Common applications: {common_apps}
-
-Generate a detailed demo in this EXACT format:
-
-TASK: {instruction}
-DOMAIN: {domain}
-
-STEPS:
-1. [First step description]
- REASONING: [Why this step is necessary]
- ACTION: [Specific action in format ACTION_TYPE(parameters)]
-
-2. [Second step description]
- REASONING: [Why this step is needed]
- ACTION: [Action specification]
-
-[Continue with all necessary steps...]
-
-N. [Final step]
- REASONING: [Completion reasoning]
- ACTION: DONE()
-
-EXPECTED_OUTCOME: [What should be achieved when complete]
-
-IMPORTANT ACTION FORMAT RULES:
-- CLICK(x=0.5, y=0.5) - normalized coordinates 0.0 to 1.0
-- TYPE("text to type") - text in double quotes
-- HOTKEY("ctrl", "s") - keys separated by commas
-- WAIT(1.0) - time in seconds
-- DRAG(start_x=0.3, start_y=0.4, end_x=0.6, end_y=0.7) - normalized coords
-- RIGHT_CLICK(x=0.5, y=0.5) - right click action
-- SCROLL(direction="down") - scroll direction
-- DONE() - mark task complete
-
-GUIDELINES:
-1. Be specific and actionable - each step should be clearly executable
-2. Use realistic coordinate positions (e.g., Start menu at x=0.02, y=0.98)
-3. Include appropriate WAIT() actions for UI transitions (typically 0.5-2.0 seconds)
-4. Always start by opening the required application if not already open
-5. Provide clear reasoning for each action
-6. Break complex operations into atomic steps
-7. End with DONE() action
-8. Typical demo should be 5-15 steps depending on complexity
-
-Generate the complete demonstration now:"""
-
- # Call LLM
- if self.provider == "anthropic":
- response = self.client.messages.create(
- model=self.model,
- max_tokens=2048,
- messages=[{"role": "user", "content": prompt}],
- )
- demo_text = response.content[0].text
- else: # openai
- response = self.client.chat.completions.create(
- model=self.model,
- messages=[{"role": "user", "content": prompt}],
- max_tokens=2048,
- )
- demo_text = response.choices[0].message.content
-
- return demo_text.strip()
-
- def generate_all_demos(
- self,
- task_list: List[Dict[str, Any]],
- use_llm: bool = True,
- skip_existing: bool = True,
- ) -> Dict[str, str]:
- """Generate demos for all tasks in the list.
-
- Args:
- task_list: List of task dictionaries with 'task_id', 'instruction', 'domain'
- use_llm: Whether to use LLM generation
- skip_existing: Skip tasks that already have demo files
-
- Returns:
- Dictionary mapping task_id to demo file path
- """
- results = {}
- total = len(task_list)
-
- for i, task in enumerate(task_list, 1):
- task_id = task["task_id"]
- instruction = task["instruction"]
- domain = task["domain"]
-
- output_file = self.output_dir / f"{task_id}.txt"
-
- # Skip if already exists
- if skip_existing and output_file.exists():
- logger.info(f"[{i}/{total}] Skipping {task_id} (already exists)")
- results[task_id] = str(output_file)
- continue
-
- try:
- logger.info(f"[{i}/{total}] Generating demo for {task_id}...")
- demo_text = self.generate_demo(task_id, instruction, domain, use_llm)
-
- # Save to file
- output_file.write_text(demo_text, encoding="utf-8")
- results[task_id] = str(output_file)
-
- logger.info(f"[{i}/{total}] ✓ Saved to {output_file}")
-
- except Exception as e:
- logger.error(f"[{i}/{total}] ✗ Failed to generate demo for {task_id}: {e}")
- results[task_id] = None
-
- return results
-
- def create_demo_index(self, demo_files: Dict[str, str]) -> Dict[str, Any]:
- """Create a JSON index of all generated demos.
-
- Args:
- demo_files: Dictionary mapping task_id to demo file path
-
- Returns:
- Index dictionary
- """
- index = {
- "version": "2.0.0",
- "description": "Synthetic WAA demonstration library for demo-conditioned prompting",
- "generator": f"{self.provider}/{self.model}",
- "total_demos": len([p for p in demo_files.values() if p is not None]),
- "demos": [],
- }
-
- for task_id, file_path in demo_files.items():
- if file_path is None:
- continue
-
- # Parse the demo file to extract metadata
- try:
- demo_text = Path(file_path).read_text(encoding="utf-8")
- task_line = [l for l in demo_text.split("\n") if l.startswith("TASK:")][0]
- domain_line = [l for l in demo_text.split("\n") if l.startswith("DOMAIN:")][0]
-
- task = task_line.replace("TASK:", "").strip()
- domain = domain_line.replace("DOMAIN:", "").strip()
-
- # Count steps
- step_count = len([l for l in demo_text.split("\n") if l.strip() and l[0].isdigit() and "." in l[:3]])
-
- index["demos"].append({
- "id": task_id,
- "task": task,
- "domain": domain,
- "file": str(Path(file_path).relative_to(self.output_dir.parent)),
- "estimated_steps": step_count,
- })
- except Exception as e:
- logger.warning(f"Failed to parse metadata from {file_path}: {e}")
-
- return index
-
-
-def load_waa_tasks_from_mock() -> List[Dict[str, Any]]:
- """Generate a comprehensive list of all 154 WAA tasks.
-
- This creates a complete task list based on known WAA structure:
- - 11 domains
- - 154 total tasks
- - Distribution based on domain complexity
-
- Returns:
- List of task dictionaries
- """
- # Task distribution per domain (totaling 154)
- task_counts = {
- "browser": 20, # Complex web tasks
- "office": 25, # Word, Excel, Outlook tasks
- "coding": 18, # VSCode and terminal
- "media": 10, # VLC playback
- "notepad": 15, # Text editing
- "paint": 12, # Drawing tasks
- "file_explorer": 18, # File operations
- "clock": 8, # Alarms, timers
- "settings": 15, # System settings
- "edge": 8, # Edge-specific
- "vscode": 5, # VSCode-specific
- }
-
- # Generate task instructions based on domain patterns
- task_templates = {
- "browser": [
- "Navigate to {url}",
- "Search for '{query}' on Google",
- "Bookmark the current page",
- "Open a new tab",
- "Clear browsing history",
- "Change homepage settings",
- "Download {file} from {url}",
- "Enable/disable extensions",
- "Open developer tools",
- "Zoom in/out on page",
- ],
- "office": [
- "Create a new document",
- "Type '{text}' and save as {filename}",
- "Format text as bold/italic",
- "Insert a table with {rows}x{cols}",
- "Add bullet points",
- "Set page margins",
- "Insert an image",
- "Create a spreadsheet",
- "Apply formula in Excel",
- "Send an email with Outlook",
- ],
- "coding": [
- "Open a Python file",
- "Run a Python script",
- "Set a breakpoint and debug",
- "Install an extension",
- "Use terminal to run '{command}'",
- "Create a new project",
- "Use git commit",
- "Search for text in files",
- "Format code",
- "Open settings",
- ],
- "media": [
- "Play a video file",
- "Pause/resume playback",
- "Adjust volume to {level}%",
- "Enable subtitles",
- "Skip forward {seconds} seconds",
- "Create a playlist",
- "Change playback speed",
- "Take a screenshot",
- "Full screen mode",
- "Adjust audio settings",
- ],
- "notepad": [
- "Open Notepad",
- "Type '{text}' in Notepad",
- "Save file as {filename}",
- "Find text '{query}'",
- "Replace '{old}' with '{new}'",
- "Change font size",
- "Enable word wrap",
- "Print document",
- "Cut/copy/paste text",
- "Open recent file",
- ],
- "paint": [
- "Draw a rectangle",
- "Fill region with color",
- "Add text to image",
- "Use pencil tool",
- "Erase area",
- "Resize canvas",
- "Save image as {filename}",
- "Select and move region",
- "Change brush size",
- "Undo last action",
- ],
- "file_explorer": [
- "Open File Explorer",
- "Navigate to {folder}",
- "Create new folder {name}",
- "Rename file to {new_name}",
- "Copy file to {destination}",
- "Delete {file}",
- "Search for files containing '{query}'",
- "Change view mode",
- "Sort files by date",
- "Show hidden files",
- ],
- "clock": [
- "Set alarm for {time}",
- "Start a {duration} timer",
- "Use stopwatch",
- "Add world clock for {city}",
- "Delete an alarm",
- "Edit timer duration",
- "Pause stopwatch",
- "Set alarm sound",
- ],
- "settings": [
- "Change display brightness",
- "Connect to WiFi network",
- "Adjust sound volume",
- "Change desktop background",
- "Modify privacy settings",
- "Update Windows",
- "Add Bluetooth device",
- "Change power settings",
- "Set default apps",
- "Manage storage",
- ],
- "edge": [
- "Open Edge browser",
- "Add site to collections",
- "Enable reading mode",
- "Clear cache and cookies",
- "Change default search engine",
- "Manage passwords",
- "Open InPrivate window",
- "Pin tab",
- ],
- "vscode": [
- "Open VS Code workspace",
- "Install Python extension",
- "Run debugger",
- "Use git integration",
- "Format document",
- ],
- }
-
- tasks = []
- for domain, count in task_counts.items():
- templates = task_templates.get(domain, [f"Perform task in {domain}"])
-
- for i in range(1, count + 1):
- # Cycle through templates
- template = templates[(i - 1) % len(templates)]
-
- # Fill in template variables with generic values
- instruction = template.format(
- url="example.com",
- query="sample query",
- file="document.txt",
- filename=f"file_{i}.txt",
- text=f"Sample text {i}",
- rows=3, cols=4,
- command="python script.py",
- level=50,
- seconds=10,
- old="old text",
- new="new text",
- folder="Documents",
- name=f"folder_{i}",
- new_name=f"renamed_{i}.txt",
- destination="Desktop",
- time="8:00 AM",
- duration="5 minutes",
- city="London",
- )
-
- tasks.append({
- "task_id": f"{domain}_{i}",
- "instruction": instruction,
- "domain": domain,
- })
-
- logger.info(f"Generated {len(tasks)} synthetic task definitions")
- return tasks
-
-
-def main():
- """Main CLI entry point."""
- parser = argparse.ArgumentParser(
- description="Generate synthetic demonstration trajectories for WAA tasks"
- )
- parser.add_argument(
- "--all",
- action="store_true",
- help="Generate demos for all 154 WAA tasks",
- )
- parser.add_argument(
- "--domains",
- type=str,
- help="Comma-separated list of domains to generate (e.g., 'notepad,browser')",
- )
- parser.add_argument(
- "--task-ids",
- type=str,
- help="Comma-separated list of specific task IDs (e.g., 'notepad_1,browser_5')",
- )
- parser.add_argument(
- "--provider",
- type=str,
- default="anthropic",
- choices=["anthropic", "openai"],
- help="LLM provider to use for generation",
- )
- parser.add_argument(
- "--model",
- type=str,
- help="Model name (uses provider default if not specified)",
- )
- parser.add_argument(
- "--output-dir",
- type=Path,
- help="Output directory for demos (default: demo_library/synthetic_demos)",
- )
- parser.add_argument(
- "--no-llm",
- action="store_true",
- help="Use only template-based generation (no LLM calls)",
- )
- parser.add_argument(
- "--skip-existing",
- action="store_true",
- default=True,
- help="Skip tasks that already have demo files",
- )
- parser.add_argument(
- "--verbose",
- action="store_true",
- help="Enable verbose logging",
- )
-
- args = parser.parse_args()
-
- # Setup logging
- logging.basicConfig(
- level=logging.DEBUG if args.verbose else logging.INFO,
- format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
- )
-
- # Initialize generator
- generator = SyntheticDemoGenerator(
- provider=args.provider,
- model=args.model,
- output_dir=args.output_dir,
- )
-
- # Load task list
- all_tasks = load_waa_tasks_from_mock()
-
- # Filter tasks based on arguments
- if args.task_ids:
- task_ids = set(args.task_ids.split(","))
- tasks = [t for t in all_tasks if t["task_id"] in task_ids]
- logger.info(f"Generating demos for {len(tasks)} specific tasks")
- elif args.domains:
- domains = set(args.domains.split(","))
- tasks = [t for t in all_tasks if t["domain"] in domains]
- logger.info(f"Generating demos for {len(tasks)} tasks in domains: {domains}")
- elif args.all:
- tasks = all_tasks
- logger.info(f"Generating demos for all {len(tasks)} WAA tasks")
- else:
- logger.error("Must specify --all, --domains, or --task-ids")
- return 1
-
- # Generate demos
- logger.info(f"Starting demo generation using {args.provider}...")
- demo_files = generator.generate_all_demos(
- tasks,
- use_llm=not args.no_llm,
- skip_existing=args.skip_existing,
- )
-
- # Create index
- logger.info("Creating demo index...")
- index = generator.create_demo_index(demo_files)
- index_path = generator.output_dir / "demos.json"
- with open(index_path, "w", encoding="utf-8") as f:
- json.dump(index, f, indent=2, ensure_ascii=False)
-
- # Summary
- successful = sum(1 for p in demo_files.values() if p is not None)
- failed = len(demo_files) - successful
-
- logger.info("\n" + "=" * 60)
- logger.info("DEMO GENERATION COMPLETE")
- logger.info("=" * 60)
- logger.info(f"Total tasks: {len(demo_files)}")
- logger.info(f"Successful: {successful}")
- logger.info(f"Failed: {failed}")
- logger.info(f"Output directory: {generator.output_dir}")
- logger.info(f"Index file: {index_path}")
- logger.info("=" * 60)
-
- return 0 if failed == 0 else 1
-
-
-if __name__ == "__main__":
- exit(main())
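# Illustrative sketch (not part of this diff): parsing a demo in the TASK/DOMAIN/STEPS
# format produced by the deleted generator above into (description, reasoning, action)
# tuples. The regex mirrors the format rules stated in the generation prompt; the
# sample text is hypothetical.
import re

SAMPLE_DEMO = """TASK: Open Notepad
DOMAIN: notepad

STEPS:
1. Click on the Start menu button
   REASONING: Need to access the Start menu
   ACTION: CLICK(x=0.02, y=0.98)

2. Type "notepad" in the search box
   REASONING: Searching is faster than navigating menus
   ACTION: TYPE("notepad")

3. Confirm Notepad is open
   REASONING: Task complete
   ACTION: DONE()

EXPECTED_OUTCOME: Notepad application is open
"""

def parse_demo(text: str) -> list[tuple[str, str, str]]:
    """Extract (description, reasoning, action) triples from a demo string."""
    blocks = re.findall(
        r"^\d+\.\s+(.+?)\n\s*REASONING:\s*(.+?)\n\s*ACTION:\s*(.+?)$",
        text,
        flags=re.MULTILINE,
    )
    return [(d.strip(), r.strip(), a.strip()) for d, r, a in blocks]

if __name__ == "__main__":
    for desc, why, action in parse_demo(SAMPLE_DEMO):
        print(f"{action:<30} # {desc}")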
diff --git a/openadapt_evals/benchmarks/live_api.py b/openadapt_evals/benchmarks/live_api.py
deleted file mode 100644
index 3164dca..0000000
--- a/openadapt_evals/benchmarks/live_api.py
+++ /dev/null
@@ -1,110 +0,0 @@
-"""Simple API server for live benchmark monitoring.
-
-This module provides a Flask API endpoint that serves the benchmark_live.json
-file for the viewer to poll.
-
-Usage:
- # Start the API server
- python -m openadapt_evals.benchmarks.live_api
-
- # Or with custom port
- python -m openadapt_evals.benchmarks.live_api --port 5001
-
- # Then open viewer in browser at http://localhost:5001
-"""
-
-from __future__ import annotations
-
-import argparse
-import json
-import logging
-from pathlib import Path
-
-from flask import Flask, jsonify, send_file
-from flask_cors import CORS
-
-logger = logging.getLogger(__name__)
-
-# Create Flask app
-app = Flask(__name__)
-CORS(app) # Enable CORS for local development
-
-# Configuration
-LIVE_FILE = Path("benchmark_live.json")
-
-
-@app.route("/api/benchmark-live")
-def get_benchmark_live():
- """Get current live benchmark status."""
- try:
- if LIVE_FILE.exists():
- with open(LIVE_FILE) as f:
- data = json.load(f)
- return jsonify(data)
- else:
- return jsonify({"status": "no_data", "message": "No live tracking data available"})
- except Exception as e:
- logger.error(f"Error reading live tracking file: {e}")
- return jsonify({"status": "error", "message": str(e)}), 500
-
-
-@app.route("/")
-def index():
- """Serve the benchmark viewer HTML."""
- viewer_path = Path(__file__).parent.parent.parent / "benchmark_results" / "viewer.html"
-
- if viewer_path.exists():
- return send_file(viewer_path)
- else:
- return """
-
- Live Benchmark Viewer
-
- Live Benchmark Viewer
- No viewer.html found. Generate one with:
- uv run python -m openadapt_evals.benchmarks.cli view --run-name {run_name}
- Or access the API directly:
-
-
-
- """
-
-
-@app.route("/health")
-def health():
- """Health check endpoint."""
- return jsonify({"status": "ok"})
-
-
-def main():
- """Run the API server."""
- parser = argparse.ArgumentParser(description="Live benchmark API server")
- parser.add_argument("--port", type=int, default=5001, help="Port to run on")
- parser.add_argument("--host", type=str, default="127.0.0.1", help="Host to bind to")
- parser.add_argument("--live-file", type=str, help="Path to benchmark_live.json")
- parser.add_argument("--debug", action="store_true", help="Enable debug mode")
-
- args = parser.parse_args()
-
- # Set live file path
- global LIVE_FILE
- if args.live_file:
- LIVE_FILE = Path(args.live_file)
-
- # Configure logging
- logging.basicConfig(
- level=logging.DEBUG if args.debug else logging.INFO,
- format="%(asctime)s [%(levelname)s] %(message)s",
- )
-
- logger.info(f"Starting live benchmark API server on {args.host}:{args.port}")
- logger.info(f"Monitoring file: {LIVE_FILE.absolute()}")
- logger.info(f"API endpoint: http://{args.host}:{args.port}/api/benchmark-live")
-
- app.run(host=args.host, port=args.port, debug=args.debug)
-
-
-if __name__ == "__main__":
- main()
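# Illustrative sketch (not part of this diff): a minimal poller for the deleted
# live_api server above. The /api/benchmark-live endpoint is defined in that module;
# the status / tasks_completed / total_tasks fields match what the deleted dashboard
# module reads from benchmark_live.json. The port assumes the server default (5001).
import json
import time
import urllib.request

def watch_benchmark(base_url: str = "http://127.0.0.1:5001", interval: float = 5.0) -> None:
    """Print progress every `interval` seconds until the benchmark stops running."""
    while True:
        with urllib.request.urlopen(f"{base_url}/api/benchmark-live", timeout=5) as resp:
            data = json.loads(resp.read().decode("utf-8"))
        if data.get("status") != "running":
            print(f"Benchmark not running (status={data.get('status')})")
            return
        print(f"{data.get('tasks_completed', 0)}/{data.get('total_tasks', 0)} tasks completed")
        time.sleep(interval)

if __name__ == "__main__":
    watch_benchmark()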
diff --git a/openadapt_evals/benchmarks/validate_demos.py b/openadapt_evals/benchmarks/validate_demos.py
deleted file mode 100644
index 366a4c2..0000000
--- a/openadapt_evals/benchmarks/validate_demos.py
+++ /dev/null
@@ -1,339 +0,0 @@
-"""Validate synthetic demo format and compatibility with APIBenchmarkAgent.
-
-This script validates that generated demos:
-1. Follow the correct format (TASK, DOMAIN, STEPS, EXPECTED_OUTCOME)
-2. Have valid action syntax
-3. Are compatible with ApiAgent demo loading
-4. Can be parsed correctly
-
-Usage:
- python -m openadapt_evals.benchmarks.validate_demos --demo-dir demo_library/synthetic_demos
- python -m openadapt_evals.benchmarks.validate_demos --demo-file demo_library/synthetic_demos/notepad_1.txt
-"""
-
-import argparse
-import json
-import logging
-import re
-from pathlib import Path
-from typing import Dict, List, Tuple
-
-logger = logging.getLogger(__name__)
-
-# Valid action types
-VALID_ACTIONS = [
- "CLICK",
- "RIGHT_CLICK",
- "DOUBLE_CLICK",
- "TRIPLE_CLICK",
- "TYPE",
- "HOTKEY",
- "WAIT",
- "DRAG",
- "HOVER",
- "SCROLL",
- "DONE",
- "FAIL",
-]
-
-
-class DemoValidator:
- """Validates demo file format and content."""
-
- def __init__(self):
- self.errors: List[str] = []
- self.warnings: List[str] = []
-
- def validate_demo(self, demo_path: Path) -> Tuple[bool, List[str], List[str]]:
- """Validate a single demo file.
-
- Args:
- demo_path: Path to demo file
-
- Returns:
- Tuple of (is_valid, errors, warnings)
- """
- self.errors = []
- self.warnings = []
-
- try:
- content = demo_path.read_text(encoding="utf-8")
- except Exception as e:
- self.errors.append(f"Failed to read file: {e}")
- return False, self.errors, self.warnings
-
- # Check required sections
- if not content.startswith("TASK:"):
- self.errors.append("Demo must start with 'TASK:' line")
-
- if "DOMAIN:" not in content:
- self.errors.append("Missing 'DOMAIN:' section")
-
- if "STEPS:" not in content:
- self.errors.append("Missing 'STEPS:' section")
-
- if "EXPECTED_OUTCOME:" not in content:
- self.warnings.append("Missing 'EXPECTED_OUTCOME:' section")
-
- # Extract and validate steps
- steps_section = self._extract_section(content, "STEPS:", "EXPECTED_OUTCOME:")
- if steps_section:
- self._validate_steps(steps_section)
- else:
- self.errors.append("Could not extract STEPS section")
-
- # Check for DONE() action
- if "DONE()" not in content:
- self.errors.append("Demo must end with DONE() action")
-
- is_valid = len(self.errors) == 0
- return is_valid, self.errors, self.warnings
-
- def _extract_section(self, content: str, start_marker: str, end_marker: str) -> str:
- """Extract a section between two markers."""
- try:
- start_idx = content.index(start_marker) + len(start_marker)
- if end_marker:
- end_idx = content.index(end_marker)
- return content[start_idx:end_idx].strip()
- else:
- return content[start_idx:].strip()
- except ValueError:
- return ""
-
- def _validate_steps(self, steps_content: str) -> None:
- """Validate the STEPS section."""
- lines = steps_content.split("\n")
- step_numbers = []
- actions = []
-
- for line in lines:
- line = line.strip()
- if not line:
- continue
-
- # Check for step numbers (e.g., "1. ", "2. ", etc.)
- step_match = re.match(r"^(\d+)\.\s+(.+)", line)
- if step_match:
- step_num = int(step_match.group(1))
- step_numbers.append(step_num)
- continue
-
- # Check for ACTION: lines
- if line.startswith("ACTION:"):
- action_str = line[7:].strip()
- actions.append(action_str)
- self._validate_action(action_str)
-
- # Validate step numbering
- if step_numbers:
- expected = list(range(1, len(step_numbers) + 1))
- if step_numbers != expected:
- self.errors.append(
- f"Step numbering is not sequential: {step_numbers} (expected {expected})"
- )
- else:
- self.warnings.append("No numbered steps found")
-
- # Validate actions
- if not actions:
- self.errors.append("No ACTION statements found")
- elif actions[-1] != "DONE()":
- self.warnings.append("Last action should be DONE()")
-
- def _validate_action(self, action_str: str) -> None:
- """Validate an individual action string."""
- # Extract action type
- action_type_match = re.match(r"^([A-Z_]+)\(", action_str)
- if not action_type_match:
- self.errors.append(f"Invalid action format: {action_str}")
- return
-
- action_type = action_type_match.group(1)
- if action_type not in VALID_ACTIONS:
- self.errors.append(f"Unknown action type: {action_type}")
-
- # Validate specific action formats
- if action_type == "CLICK":
- if not re.match(r"CLICK\(x=[\d.]+,\s*y=[\d.]+\)", action_str):
- self.errors.append(f"Invalid CLICK format: {action_str}")
- return
- # Check coordinate ranges
- x_match = re.search(r"x=([\d.]+)", action_str)
- y_match = re.search(r"y=([\d.]+)", action_str)
- if x_match and y_match:
- x = float(x_match.group(1))
- y = float(y_match.group(1))
- if not (0.0 <= x <= 1.0) or not (0.0 <= y <= 1.0):
- self.warnings.append(
- f"Coordinates outside normalized range [0,1]: {action_str}"
- )
-
- elif action_type in ["RIGHT_CLICK", "DOUBLE_CLICK", "HOVER"]:
- if not re.match(rf"{action_type}\(x=[\d.]+,\s*y=[\d.]+\)", action_str):
- self.errors.append(f"Invalid {action_type} format: {action_str}")
-
- elif action_type == "TYPE":
- if not re.match(r'TYPE\(".*"\)', action_str):
- self.errors.append(f"Invalid TYPE format (should use double quotes): {action_str}")
-
- elif action_type == "HOTKEY":
- # Support both formats: HOTKEY("ctrl+s") and HOTKEY("ctrl", "s")
- # Also support special chars like "+", "-", etc.
- if not (re.match(r'HOTKEY\("[\w\s,+\-=]+"\)', action_str) or
- re.match(r'HOTKEY\("[\w\-+=]+"\s*(,\s*"[\w\-+=]+"\s*)*\)', action_str)):
- self.errors.append(f"Invalid HOTKEY format: {action_str}")
-
- elif action_type == "WAIT":
- if not re.match(r"WAIT\([\d.]+\)", action_str):
- self.errors.append(f"Invalid WAIT format: {action_str}")
-
- elif action_type == "DRAG":
- if not re.match(
- r"DRAG\(start_x=[\d.]+,\s*start_y=[\d.]+,\s*end_x=[\d.]+,\s*end_y=[\d.]+\)",
- action_str,
- ):
- self.errors.append(f"Invalid DRAG format: {action_str}")
-
- elif action_type == "SCROLL":
- if not re.match(r'SCROLL\(direction="(up|down|left|right)"\)', action_str):
- self.errors.append(f"Invalid SCROLL format: {action_str}")
-
- elif action_type in ["DONE", "FAIL"]:
- if action_str != f"{action_type}()":
- self.errors.append(f"Invalid {action_type} format (should be '{action_type}()'): {action_str}")
-
-
-def validate_all_demos(demo_dir: Path) -> Dict[str, Dict]:
- """Validate all demos in a directory.
-
- Args:
- demo_dir: Directory containing demo files
-
- Returns:
- Dictionary mapping demo_id to validation results
- """
- validator = DemoValidator()
- results = {}
-
- demo_files = sorted(demo_dir.glob("*.txt"))
- logger.info(f"Validating {len(demo_files)} demos in {demo_dir}")
-
- for demo_file in demo_files:
- demo_id = demo_file.stem
- is_valid, errors, warnings = validator.validate_demo(demo_file)
-
- results[demo_id] = {
- "valid": is_valid,
- "errors": errors,
- "warnings": warnings,
- "file": str(demo_file),
- }
-
- if is_valid:
- if warnings:
- logger.info(f"✓ {demo_id} (valid with {len(warnings)} warnings)")
- else:
- logger.info(f"✓ {demo_id} (valid)")
- else:
- logger.error(f"✗ {demo_id} (invalid - {len(errors)} errors)")
-
- return results
-
-
-def main():
- """Main CLI entry point."""
- parser = argparse.ArgumentParser(description="Validate synthetic demo files")
- parser.add_argument(
- "--demo-dir",
- type=Path,
- help="Directory containing demo files to validate",
- )
- parser.add_argument(
- "--demo-file",
- type=Path,
- help="Specific demo file to validate",
- )
- parser.add_argument(
- "--verbose",
- action="store_true",
- help="Show detailed error messages",
- )
- parser.add_argument(
- "--json-output",
- type=Path,
- help="Save validation results to JSON file",
- )
-
- args = parser.parse_args()
-
- # Setup logging
- logging.basicConfig(
- level=logging.DEBUG if args.verbose else logging.INFO,
- format="%(message)s",
- )
-
- # Validate demos
- if args.demo_file:
- # Validate single file
- validator = DemoValidator()
- is_valid, errors, warnings = validator.validate_demo(args.demo_file)
-
- print(f"\nValidation Results for {args.demo_file.name}:")
- print(f"{'=' * 60}")
- print(f"Valid: {is_valid}")
-
- if errors:
- print(f"\nErrors ({len(errors)}):")
- for error in errors:
- print(f" - {error}")
-
- if warnings:
- print(f"\nWarnings ({len(warnings)}):")
- for warning in warnings:
- print(f" - {warning}")
-
- return 0 if is_valid else 1
-
- elif args.demo_dir:
- # Validate directory
- results = validate_all_demos(args.demo_dir)
-
- # Summary
- total = len(results)
- valid = sum(1 for r in results.values() if r["valid"])
- invalid = total - valid
-
- print(f"\n{'=' * 60}")
- print("VALIDATION SUMMARY")
- print(f"{'=' * 60}")
- print(f"Total demos: {total}")
- print(f"Valid: {valid} ({valid/total*100:.1f}%)")
- print(f"Invalid: {invalid} ({invalid/total*100:.1f}%)")
-
- # Show errors if verbose
- if args.verbose and invalid > 0:
- print(f"\n{'=' * 60}")
- print("DETAILED ERRORS")
- print(f"{'=' * 60}")
- for demo_id, result in results.items():
- if not result["valid"]:
- print(f"\n{demo_id}:")
- for error in result["errors"]:
- print(f" - {error}")
-
- # Save JSON output
- if args.json_output:
- with open(args.json_output, "w") as f:
- json.dump(results, f, indent=2)
- print(f"\nValidation results saved to {args.json_output}")
-
- return 0 if invalid == 0 else 1
-
- else:
- parser.print_help()
- return 1
-
-
-if __name__ == "__main__":
- exit(main())
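# Illustrative sketch (not part of this diff): spot-checking single ACTION strings
# with the same CLICK/TYPE/WAIT/DONE patterns the deleted validator above enforces,
# without constructing a full DemoValidator or demo file.
import re

ACTION_PATTERNS = {
    "CLICK": r"CLICK\(x=[\d.]+,\s*y=[\d.]+\)",
    "TYPE": r'TYPE\(".*"\)',
    "WAIT": r"WAIT\([\d.]+\)",
    "DONE": r"DONE\(\)",
}

def check_action(action_str: str) -> bool:
    """Return True if the action string matches one of the known formats."""
    return any(re.fullmatch(p, action_str) for p in ACTION_PATTERNS.values())

if __name__ == "__main__":
    samples = ['CLICK(x=0.5, y=0.5)', 'TYPE("hello")', 'WAIT(1.0)', 'DONE()', 'CLICK(0.5, 0.5)']
    for sample in samples:
        print(f"{sample:<25} -> {'OK' if check_action(sample) else 'INVALID'}")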
diff --git a/openadapt_evals/benchmarks/validate_screenshots.py b/openadapt_evals/benchmarks/validate_screenshots.py
deleted file mode 100644
index b709cb0..0000000
--- a/openadapt_evals/benchmarks/validate_screenshots.py
+++ /dev/null
@@ -1,82 +0,0 @@
-"""Simple screenshot validation - detect blank/idle screenshots."""
-
-from pathlib import Path
-from PIL import Image
-import numpy as np
-
-
-def validate_screenshot(path: str, min_variance: float = 100.0) -> tuple[bool, str]:
- """Check if screenshot shows real content (not blank/idle).
-
- Args:
- path: Path to screenshot file
- min_variance: Minimum pixel variance threshold (default 100)
-
- Returns:
- (is_valid, reason) tuple
- """
- try:
- img = Image.open(path).convert('L') # Grayscale
- arr = np.array(img)
- variance = float(arr.var())
-
- if variance < min_variance:
- return False, f"Low variance ({variance:.1f}) - likely idle/blank"
- return True, f"OK (variance: {variance:.1f})"
- except Exception as e:
- return False, f"Error: {e}"
-
-
-def validate_directory(dir_path: str) -> dict[str, tuple[bool, str]]:
- """Validate all screenshots in a directory.
-
- Args:
- dir_path: Path to directory containing screenshots
-
- Returns:
- Dict mapping filename to (is_valid, reason) tuple
- """
- results = {}
- path = Path(dir_path)
-
- for ext in ['*.png', '*.jpg', '*.jpeg']:
- for f in path.glob(ext):
- results[f.name] = validate_screenshot(str(f))
-
- return results
-
-
-def summarize_results(results: dict[str, tuple[bool, str]]) -> dict:
- """Summarize validation results.
-
- Returns:
- Dict with total, valid, invalid counts and list of invalid files
- """
- valid = [k for k, (v, _) in results.items() if v]
- invalid = [k for k, (v, _) in results.items() if not v]
-
- return {
- 'total': len(results),
- 'valid': len(valid),
- 'invalid': len(invalid),
- 'invalid_files': invalid,
- }
-
-
-if __name__ == '__main__':
- import sys
- if len(sys.argv) < 2:
- print("Usage: python validate_screenshots.py ")
- sys.exit(1)
-
- results = validate_directory(sys.argv[1])
- summary = summarize_results(results)
-
- print(f"Validated {summary['total']} screenshots:")
- print(f" Valid: {summary['valid']}")
- print(f" Invalid: {summary['invalid']}")
-
- if summary['invalid_files']:
- print("\nInvalid files:")
- for f in summary['invalid_files']:
- print(f" - {f}: {results[f][1]}")
diff --git a/openadapt_evals/benchmarks/waa.py b/openadapt_evals/benchmarks/waa.py
deleted file mode 100644
index d4fe10b..0000000
--- a/openadapt_evals/benchmarks/waa.py
+++ /dev/null
@@ -1,29 +0,0 @@
-"""DEPRECATED: Import from openadapt_evals.adapters instead.
-
-This module is kept for backward compatibility only.
-All classes are re-exported from openadapt_evals.adapters.waa.
-"""
-
-import warnings
-
-warnings.warn(
- "openadapt_evals.benchmarks.waa is deprecated. "
- "Please import from openadapt_evals.adapters instead.",
- DeprecationWarning,
- stacklevel=2,
-)
-
-# Re-export from canonical location
-from openadapt_evals.adapters.waa import (
- WAA_DOMAINS,
- WAAAdapter,
- WAAConfig,
- WAAMockAdapter,
-)
-
-__all__ = [
- "WAA_DOMAINS",
- "WAAAdapter",
- "WAAConfig",
- "WAAMockAdapter",
-]
diff --git a/openadapt_evals/benchmarks/waa_live.py b/openadapt_evals/benchmarks/waa_live.py
deleted file mode 100644
index b6c3408..0000000
--- a/openadapt_evals/benchmarks/waa_live.py
+++ /dev/null
@@ -1,25 +0,0 @@
-"""DEPRECATED: Import from openadapt_evals.adapters instead.
-
-This module is kept for backward compatibility only.
-All classes are re-exported from openadapt_evals.adapters.waa_live.
-"""
-
-import warnings
-
-warnings.warn(
- "openadapt_evals.benchmarks.waa_live is deprecated. "
- "Please import from openadapt_evals.adapters instead.",
- DeprecationWarning,
- stacklevel=2,
-)
-
-# Re-export from canonical location
-from openadapt_evals.adapters.waa_live import (
- WAALiveAdapter,
- WAALiveConfig,
-)
-
-__all__ = [
- "WAALiveAdapter",
- "WAALiveConfig",
-]
diff --git a/openadapt_evals/cli/__init__.py b/openadapt_evals/cli/__init__.py
new file mode 100644
index 0000000..f58ac6a
--- /dev/null
+++ b/openadapt_evals/cli/__init__.py
@@ -0,0 +1,14 @@
+"""OpenAdapt CLI module.
+
+This module provides the `oa` command-line interface:
+- `oa evals` - Benchmark evaluation commands
+
+Example:
+ oa evals vm setup # Setup Azure VM
+ oa evals run --agent gpt-4o # Run evaluation
+ oa evals view # View results
+"""
+
+from openadapt_evals.cli.main import main
+
+__all__ = ["main"]
diff --git a/openadapt_evals/cli/main.py b/openadapt_evals/cli/main.py
new file mode 100644
index 0000000..b331b81
--- /dev/null
+++ b/openadapt_evals/cli/main.py
@@ -0,0 +1,280 @@
+"""Main CLI entry point for OpenAdapt.
+
+This provides the `oa` command with namespaced subcommands:
+- `oa evals` - Benchmark evaluation commands (VM, run, view, etc.)
+
+Future:
+- `oa ml` - ML training commands (provided by openadapt-ml)
+
+Usage:
+ oa evals vm setup # Setup Azure VM with WAA
+ oa evals vm status # Check VM status
+ oa evals run --agent gpt-4o # Run live evaluation
+ oa evals mock --tasks 10 # Run mock evaluation
+ oa evals view # View results
+"""
+
+from __future__ import annotations
+
+import argparse
+import sys
+
+
+def main(argv: list[str] | None = None) -> int:
+ """Main entry point for the `oa` CLI."""
+ parser = argparse.ArgumentParser(
+ prog="oa",
+ description="OpenAdapt CLI - GUI agent benchmark toolkit",
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ epilog="""
+Examples:
+ oa evals vm setup # Setup Azure VM with WAA
+ oa evals run --agent gpt-4o # Run live evaluation
+ oa evals mock --tasks 10 # Run mock evaluation
+ oa evals view # View results
+""",
+ )
+
+ subparsers = parser.add_subparsers(dest="namespace", help="Command namespace")
+
+ # Register 'evals' namespace
+ evals_parser = subparsers.add_parser(
+ "evals",
+ help="Benchmark evaluation commands",
+ description="Commands for running GUI agent benchmark evaluations",
+ )
+ _register_evals_commands(evals_parser)
+
+ args = parser.parse_args(argv)
+
+ if args.namespace is None:
+ parser.print_help()
+ return 0
+
+ if args.namespace == "evals":
+ return _dispatch_evals(args)
+
+ return 0
+
+
+def _register_evals_commands(parser: argparse.ArgumentParser) -> None:
+ """Register all evaluation commands under 'oa evals'."""
+ subparsers = parser.add_subparsers(dest="command", help="Evaluation command")
+
+ # VM management commands
+ vm_parser = subparsers.add_parser(
+ "vm",
+ help="VM management commands",
+ description="Azure VM lifecycle and management",
+ )
+ _register_vm_commands(vm_parser)
+
+ # Run evaluation
+ run_parser = subparsers.add_parser(
+ "run",
+ help="Run live evaluation against WAA server",
+ description="Run evaluation against a live WAA server",
+ )
+ run_parser.add_argument("--agent", default="gpt-4o", help="Agent type")
+ run_parser.add_argument("--server", default="http://localhost:5001", help="WAA server URL")
+ run_parser.add_argument("--tasks", type=int, help="Number of tasks (or use --task-ids)")
+ run_parser.add_argument("--task-ids", help="Comma-separated task IDs")
+ run_parser.add_argument("--output", "-o", help="Output directory")
+ run_parser.add_argument("--run-name", help="Run name for results")
+ run_parser.add_argument("--demo", help="Demo text or file path")
+ run_parser.add_argument("--max-steps", type=int, default=15, help="Max steps per task")
+
+ # Mock evaluation
+ mock_parser = subparsers.add_parser(
+ "mock",
+ help="Run mock evaluation (no VM required)",
+ description="Run evaluation with mock adapter for testing",
+ )
+ mock_parser.add_argument("--tasks", type=int, default=10, help="Number of tasks")
+ mock_parser.add_argument("--agent", default="mock", help="Agent type")
+ mock_parser.add_argument("--output", "-o", help="Output directory")
+ mock_parser.add_argument("--run-name", help="Run name for results")
+ mock_parser.add_argument("--demo", help="Demo text or file path")
+ mock_parser.add_argument("--max-steps", type=int, default=15, help="Max steps per task")
+
+ # Probe server
+ probe_parser = subparsers.add_parser(
+ "probe",
+ help="Check if WAA server is ready",
+ description="Probe WAA server health endpoint",
+ )
+ probe_parser.add_argument("--server", default="http://localhost:5001", help="WAA server URL")
+ probe_parser.add_argument("--wait", action="store_true", help="Wait for server to be ready")
+ probe_parser.add_argument("--timeout", type=int, default=300, help="Timeout in seconds")
+
+ # View results
+ view_parser = subparsers.add_parser(
+ "view",
+ help="Generate results viewer",
+ description="Generate HTML viewer for evaluation results",
+ )
+ view_parser.add_argument("--run-name", help="Run name to view")
+ view_parser.add_argument("--output", "-o", default="benchmark_results", help="Results directory")
+ view_parser.add_argument("--port", type=int, default=9000, help="Server port")
+ view_parser.add_argument("--no-open", action="store_true", help="Don't open browser")
+
+ # List tasks
+ tasks_parser = subparsers.add_parser(
+ "tasks",
+ help="List available benchmark tasks",
+ description="List available WAA benchmark tasks",
+ )
+ tasks_parser.add_argument("--domain", help="Filter by domain")
+
+
+def _register_vm_commands(parser: argparse.ArgumentParser) -> None:
+ """Register VM management commands under 'oa evals vm'."""
+ subparsers = parser.add_subparsers(dest="vm_action", help="VM action")
+
+ # Setup (create + configure)
+ setup_parser = subparsers.add_parser("setup", help="Full VM setup with WAA")
+ setup_parser.add_argument("--vm-name", default="waa-eval-vm", help="VM name")
+ setup_parser.add_argument("--resource-group", default="openadapt-agents", help="Resource group")
+ setup_parser.add_argument("--vm-size", default="Standard_D8ds_v5", help="VM size")
+ setup_parser.add_argument("--location", default="eastus", help="Azure region")
+
+ # Status
+ subparsers.add_parser("status", help="Show VM status")
+
+ # Start/stop
+ subparsers.add_parser("start", help="Start deallocated VM")
+ subparsers.add_parser("stop", help="Stop VM")
+ subparsers.add_parser("deallocate", help="Deallocate VM (stops billing)")
+
+ # Delete
+ delete_parser = subparsers.add_parser("delete", help="Delete VM and resources")
+ delete_parser.add_argument("-y", "--yes", action="store_true", help="Skip confirmation")
+
+ # Probe
+ probe_parser = subparsers.add_parser("probe", help="Check WAA server status")
+ probe_parser.add_argument("--wait", action="store_true", help="Wait for ready")
+
+ # Logs
+ logs_parser = subparsers.add_parser("logs", help="View container logs")
+ logs_parser.add_argument("--lines", type=int, default=100, help="Number of lines")
+ logs_parser.add_argument("--follow", "-f", action="store_true", help="Follow log output")
+
+ # Diagnostics
+ subparsers.add_parser("diag", help="Show VM diagnostic info")
+
+ # SSH
+ subparsers.add_parser("ssh", help="Open SSH session to VM")
+
+ # VNC
+ subparsers.add_parser("vnc", help="Open VNC viewer")
+
+ # Exec
+ exec_parser = subparsers.add_parser("exec", help="Run command on VM")
+ exec_parser.add_argument("--cmd", required=True, help="Command to run")
+
+ # Monitor
+ monitor_parser = subparsers.add_parser("monitor", help="Start monitoring dashboard")
+ monitor_parser.add_argument("--details", action="store_true", help="Show detailed info")
+ monitor_parser.add_argument("--auto-shutdown-hours", type=float, help="Auto-shutdown after N hours")
+
+
+def _dispatch_evals(args: argparse.Namespace) -> int:
+ """Dispatch evaluation commands."""
+ if args.command is None:
+ print("Usage: oa evals ")
+ print("Commands: vm, run, mock, probe, view, tasks")
+ print("Use 'oa evals --help' for more info")
+ return 0
+
+ if args.command == "vm":
+ return _dispatch_vm(args)
+ elif args.command == "mock":
+ return _cmd_mock(args)
+ elif args.command == "run":
+ return _cmd_run(args)
+ elif args.command == "probe":
+ return _cmd_probe(args)
+ elif args.command == "view":
+ return _cmd_view(args)
+ elif args.command == "tasks":
+ return _cmd_tasks(args)
+
+ print(f"Unknown command: {args.command}")
+ return 1
+
+
+def _dispatch_vm(args: argparse.Namespace) -> int:
+ """Dispatch VM commands."""
+ if args.vm_action is None:
+ print("Usage: oa evals vm ")
+ print("Actions: setup, status, start, stop, deallocate, delete, probe, logs, diag, ssh, vnc, exec, monitor")
+ return 0
+
+ # Import VM commands lazily to avoid slow startup
+ from openadapt_evals.cli import vm
+
+ action = args.vm_action
+ if action == "setup":
+ return vm.cmd_setup(args)
+ elif action == "status":
+ return vm.cmd_status(args)
+ elif action == "start":
+ return vm.cmd_start(args)
+ elif action == "stop":
+ return vm.cmd_stop(args)
+ elif action == "deallocate":
+ return vm.cmd_deallocate(args)
+ elif action == "delete":
+ return vm.cmd_delete(args)
+ elif action == "probe":
+ return vm.cmd_probe(args)
+ elif action == "logs":
+ return vm.cmd_logs(args)
+ elif action == "diag":
+ return vm.cmd_diag(args)
+ elif action == "ssh":
+ return vm.cmd_ssh(args)
+ elif action == "vnc":
+ return vm.cmd_vnc(args)
+ elif action == "exec":
+ return vm.cmd_exec(args)
+ elif action == "monitor":
+ return vm.cmd_monitor(args)
+
+ print(f"Unknown VM action: {action}")
+ return 1
+
+
+def _cmd_mock(args: argparse.Namespace) -> int:
+ """Run mock evaluation."""
+ # Delegate to existing CLI implementation
+ from openadapt_evals.benchmarks.cli import cmd_mock
+ return cmd_mock(args)
+
+
+def _cmd_run(args: argparse.Namespace) -> int:
+ """Run live evaluation."""
+ from openadapt_evals.benchmarks.cli import cmd_live
+ return cmd_live(args)
+
+
+def _cmd_probe(args: argparse.Namespace) -> int:
+ """Probe WAA server."""
+ from openadapt_evals.benchmarks.cli import cmd_probe
+ return cmd_probe(args)
+
+
+def _cmd_view(args: argparse.Namespace) -> int:
+ """Generate results viewer."""
+ from openadapt_evals.benchmarks.cli import cmd_view
+ return cmd_view(args)
+
+
+def _cmd_tasks(args: argparse.Namespace) -> int:
+ """List available tasks."""
+ from openadapt_evals.benchmarks.cli import cmd_tasks
+ return cmd_tasks(args)
+
+
+if __name__ == "__main__":
+ sys.exit(main())
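# Illustrative sketch (not part of this diff): invoking the new `oa` entry point
# programmatically, assuming the openadapt_evals package (and its benchmarks.cli
# module) is installed. Passing argv directly mirrors what the `oa` console script
# does on the command line.
from openadapt_evals.cli.main import main

if __name__ == "__main__":
    # Equivalent to running `oa evals mock --tasks 2 --run-name smoke-test` in a shell.
    exit_code = main(["evals", "mock", "--tasks", "2", "--run-name", "smoke-test"])
    print(f"mock evaluation exited with code {exit_code}")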
diff --git a/openadapt_evals/cli/vm.py b/openadapt_evals/cli/vm.py
new file mode 100644
index 0000000..e66306d
--- /dev/null
+++ b/openadapt_evals/cli/vm.py
@@ -0,0 +1,413 @@
+"""VM management commands for oa evals.
+
+This module provides Azure VM lifecycle commands:
+- setup: Create and configure VM with WAA
+- status: Show VM status
+- start/stop/deallocate: Control VM state
+- delete: Remove VM and resources
+- probe: Check WAA server status
+- logs: View container logs
+- diag: Diagnostic info
+- ssh/vnc: Interactive access
+- exec: Run commands
+- monitor: Start monitoring dashboard
+
+Usage:
+ oa evals vm setup # Full setup
+ oa evals vm status # Check status
+ oa evals vm probe # Check WAA server
+ oa evals vm logs # View logs
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import logging
+import subprocess
+import sys
+
+logger = logging.getLogger(__name__)
+
+# Default VM configuration
+DEFAULT_VM_NAME = "waa-eval-vm"
+DEFAULT_RESOURCE_GROUP = "openadapt-agents"
+DEFAULT_VM_SIZE = "Standard_D8ds_v5"
+DEFAULT_LOCATION = "eastus"
+
+
+def _run_az(cmd: list[str], capture: bool = True) -> subprocess.CompletedProcess:
+ """Run Azure CLI command."""
+ full_cmd = ["az"] + cmd
+ logger.debug(f"Running: {' '.join(full_cmd)}")
+ return subprocess.run(
+ full_cmd,
+ capture_output=capture,
+ text=True,
+ )
+
+
+def _get_vm_ip(vm_name: str = DEFAULT_VM_NAME, resource_group: str = DEFAULT_RESOURCE_GROUP) -> str | None:
+ """Get VM public IP address."""
+ result = _run_az([
+ "vm", "show",
+ "-n", vm_name,
+ "-g", resource_group,
+ "-d",
+ "--query", "publicIps",
+ "-o", "tsv",
+ ])
+ if result.returncode == 0 and result.stdout.strip():
+ return result.stdout.strip()
+ return None
+
+
+def _get_vm_status(vm_name: str = DEFAULT_VM_NAME, resource_group: str = DEFAULT_RESOURCE_GROUP) -> dict | None:
+ """Get VM status."""
+ result = _run_az([
+ "vm", "show",
+ "-n", vm_name,
+ "-g", resource_group,
+ "-d",
+ "-o", "json",
+ ])
+ if result.returncode == 0:
+ return json.loads(result.stdout)
+ return None
+
+
+def cmd_setup(args: argparse.Namespace) -> int:
+ """Full VM setup with WAA."""
+ vm_name = getattr(args, "vm_name", DEFAULT_VM_NAME)
+ resource_group = getattr(args, "resource_group", DEFAULT_RESOURCE_GROUP)
+ vm_size = getattr(args, "vm_size", DEFAULT_VM_SIZE)
+ location = getattr(args, "location", DEFAULT_LOCATION)
+
+ print(f"Setting up Azure VM '{vm_name}' with WAA...")
+ print(f" Resource group: {resource_group}")
+ print(f" VM size: {vm_size}")
+ print(f" Location: {location}")
+
+ # Check if VM already exists
+ status = _get_vm_status(vm_name, resource_group)
+ if status:
+ power_state = status.get("powerState", "unknown")
+ print(f" VM already exists (state: {power_state})")
+ if power_state != "VM running":
+ print(" Starting VM...")
+ _run_az(["vm", "start", "-n", vm_name, "-g", resource_group])
+ return 0
+
+ # Create VM
+ print(" Creating VM (this may take a few minutes)...")
+ result = _run_az([
+ "vm", "create",
+ "-n", vm_name,
+ "-g", resource_group,
+ "--image", "Ubuntu2204",
+ "--size", vm_size,
+ "--location", location,
+ "--admin-username", "azureuser",
+ "--generate-ssh-keys",
+ "--public-ip-sku", "Standard",
+ ])
+
+ if result.returncode != 0:
+ print(f"ERROR: Failed to create VM: {result.stderr}")
+ return 1
+
+ print(" VM created successfully!")
+
+ # Get IP
+ ip = _get_vm_ip(vm_name, resource_group)
+ if ip:
+ print(f" Public IP: {ip}")
+
+ # Install Docker and setup WAA
+ print(" Installing Docker and WAA (this may take 10-15 minutes)...")
+ setup_script = """
+set -e
+sudo apt-get update
+sudo apt-get install -y docker.io
+sudo systemctl start docker
+sudo systemctl enable docker
+sudo usermod -aG docker $USER
+sudo docker pull windowsarena/winarena:latest
+echo "WAA image pulled successfully"
+"""
+
+ result = _run_az([
+ "vm", "run-command", "invoke",
+ "-n", vm_name,
+ "-g", resource_group,
+ "--command-id", "RunShellScript",
+ "--scripts", setup_script,
+ ])
+
+ if result.returncode != 0:
+ print(f"WARNING: Setup script may have failed: {result.stderr}")
+ else:
+ print(" Docker and WAA installed!")
+
+ print("\nSetup complete! Next steps:")
+ print(f" oa evals vm status # Check VM status")
+ print(f" oa evals vm probe --wait # Wait for WAA server")
+ print(f" oa evals run --agent gpt-4o # Run evaluation")
+
+ return 0
+
+
+def cmd_status(args: argparse.Namespace) -> int:
+ """Show VM status."""
+ status = _get_vm_status()
+ if not status:
+ print("VM not found or not accessible")
+ return 1
+
+ print(f"VM: {status.get('name', 'unknown')}")
+ print(f" State: {status.get('powerState', 'unknown')}")
+ print(f" Size: {status.get('hardwareProfile', {}).get('vmSize', 'unknown')}")
+ print(f" Location: {status.get('location', 'unknown')}")
+
+ ip = _get_vm_ip()
+ if ip:
+ print(f" Public IP: {ip}")
+
+ return 0
+
+
+def cmd_start(args: argparse.Namespace) -> int:
+ """Start deallocated VM."""
+ print("Starting VM...")
+ result = _run_az(["vm", "start", "-n", DEFAULT_VM_NAME, "-g", DEFAULT_RESOURCE_GROUP])
+ if result.returncode != 0:
+ print(f"ERROR: {result.stderr}")
+ return 1
+ print("VM started!")
+ return cmd_status(args)
+
+
+def cmd_stop(args: argparse.Namespace) -> int:
+ """Stop VM."""
+ print("Stopping VM...")
+ result = _run_az(["vm", "stop", "-n", DEFAULT_VM_NAME, "-g", DEFAULT_RESOURCE_GROUP])
+ if result.returncode != 0:
+ print(f"ERROR: {result.stderr}")
+ return 1
+ print("VM stopped!")
+ return 0
+
+
+def cmd_deallocate(args: argparse.Namespace) -> int:
+ """Deallocate VM (stops billing)."""
+ print("Deallocating VM (this stops billing)...")
+ result = _run_az(["vm", "deallocate", "-n", DEFAULT_VM_NAME, "-g", DEFAULT_RESOURCE_GROUP])
+ if result.returncode != 0:
+ print(f"ERROR: {result.stderr}")
+ return 1
+ print("VM deallocated! Billing stopped.")
+ return 0
+
+
+def cmd_delete(args: argparse.Namespace) -> int:
+ """Delete VM and resources."""
+ if not getattr(args, "yes", False):
+ print(f"This will DELETE VM '{DEFAULT_VM_NAME}' and all associated resources.")
+ response = input("Are you sure? (y/N): ")
+ if response.lower() != "y":
+ print("Cancelled.")
+ return 0
+
+ print("Deleting VM...")
+ result = _run_az([
+ "vm", "delete",
+ "-n", DEFAULT_VM_NAME,
+ "-g", DEFAULT_RESOURCE_GROUP,
+ "--yes",
+ "--force-deletion", "true",
+ ])
+ if result.returncode != 0:
+ print(f"ERROR: {result.stderr}")
+ return 1
+ print("VM deleted!")
+ return 0
+
+
+def cmd_probe(args: argparse.Namespace) -> int:
+ """Check WAA server status."""
+ ip = _get_vm_ip()
+ if not ip:
+ print("VM not found or no public IP")
+ return 1
+
+ import urllib.request
+ import urllib.error
+
+ url = f"http://{ip}:5000/probe"
+ wait = getattr(args, "wait", False)
+ timeout = getattr(args, "timeout", 300)
+
+ if wait:
+ print(f"Waiting for WAA server at {url}...")
+ import time
+ start = time.time()
+ while time.time() - start < timeout:
+ try:
+ response = urllib.request.urlopen(url, timeout=5)
+ data = json.loads(response.read())
+ print(f"WAA server ready! Status: {data}")
+ return 0
+            except (urllib.error.URLError, json.JSONDecodeError, TimeoutError):
+ print(".", end="", flush=True)
+ time.sleep(5)
+ print(f"\nTimeout after {timeout}s")
+ return 1
+
+ try:
+ response = urllib.request.urlopen(url, timeout=10)
+ data = json.loads(response.read())
+ print(f"WAA server status: {data}")
+ return 0
+ except urllib.error.URLError as e:
+ print(f"WAA server not reachable: {e}")
+ return 1
+
+
+def cmd_logs(args: argparse.Namespace) -> int:
+ """View container logs."""
+ lines = getattr(args, "lines", 100)
+ follow = getattr(args, "follow", False)
+
+ cmd = f"docker logs winarena --tail {lines}"
+ if follow:
+ cmd += " -f"
+
+ ip = _get_vm_ip()
+ if not ip:
+ print("VM not found")
+ return 1
+
+ print(f"Fetching logs from {ip}...")
+ result = subprocess.run(
+ ["ssh", "-o", "StrictHostKeyChecking=no", f"azureuser@{ip}", cmd],
+ capture_output=not follow,
+ text=True,
+ )
+
+ if not follow:
+ print(result.stdout)
+ if result.stderr:
+ print(result.stderr, file=sys.stderr)
+
+ return result.returncode
+
+
+def cmd_diag(args: argparse.Namespace) -> int:
+ """Show VM diagnostic info."""
+ ip = _get_vm_ip()
+ if not ip:
+ print("VM not found")
+ return 1
+
+ print(f"Running diagnostics on {ip}...")
+
+ diag_cmd = """
+echo "=== Disk Usage ==="
+df -h /mnt /var/lib/docker 2>/dev/null || df -h
+echo ""
+echo "=== Docker Status ==="
+docker ps -a
+echo ""
+echo "=== Docker Images ==="
+docker images
+echo ""
+echo "=== Memory ==="
+free -h
+"""
+
+ result = subprocess.run(
+ ["ssh", "-o", "StrictHostKeyChecking=no", f"azureuser@{ip}", diag_cmd],
+ capture_output=True,
+ text=True,
+ )
+
+ print(result.stdout)
+ if result.stderr:
+ print(result.stderr, file=sys.stderr)
+
+ return result.returncode
+
+
+def cmd_ssh(args: argparse.Namespace) -> int:
+ """Open SSH session to VM."""
+ ip = _get_vm_ip()
+ if not ip:
+ print("VM not found")
+ return 1
+
+ print(f"Connecting to {ip}...")
+ return subprocess.call(["ssh", "-o", "StrictHostKeyChecking=no", f"azureuser@{ip}"])
+
+
+def cmd_vnc(args: argparse.Namespace) -> int:
+ """Open VNC viewer."""
+ ip = _get_vm_ip()
+ if not ip:
+ print("VM not found")
+ return 1
+
+ # Start SSH tunnel for VNC
+ print(f"Starting SSH tunnel to {ip}:8006...")
+ print("VNC will be available at http://localhost:8006")
+ print("Press Ctrl+C to stop the tunnel")
+
+ try:
+ subprocess.call([
+ "ssh", "-o", "StrictHostKeyChecking=no",
+ "-L", "8006:localhost:8006",
+ f"azureuser@{ip}",
+ "-N",
+ ])
+ except KeyboardInterrupt:
+ print("\nTunnel closed.")
+
+ return 0
+
+
+def cmd_exec(args: argparse.Namespace) -> int:
+ """Run command on VM."""
+ cmd = getattr(args, "cmd", None)
+ if not cmd:
+ print("ERROR: --cmd is required")
+ return 1
+
+ ip = _get_vm_ip()
+ if not ip:
+ print("VM not found")
+ return 1
+
+ result = subprocess.run(
+ ["ssh", "-o", "StrictHostKeyChecking=no", f"azureuser@{ip}", cmd],
+ capture_output=True,
+ text=True,
+ )
+
+ print(result.stdout)
+ if result.stderr:
+ print(result.stderr, file=sys.stderr)
+
+ return result.returncode
+
+
+def cmd_monitor(args: argparse.Namespace) -> int:
+ """Start monitoring dashboard."""
+ print("Starting monitoring dashboard...")
+ print("This feature requires the full infrastructure setup.")
+ print("For now, use individual commands:")
+ print(" oa evals vm status # Check VM status")
+ print(" oa evals vm probe # Check WAA server")
+ print(" oa evals vm logs # View logs")
+ print(" oa evals vm vnc # Open VNC viewer")
+
+ # Show quick status
+ return cmd_status(args)
diff --git a/openadapt_evals/evaluation/__init__.py b/openadapt_evals/evaluation/__init__.py
new file mode 100644
index 0000000..debde6d
--- /dev/null
+++ b/openadapt_evals/evaluation/__init__.py
@@ -0,0 +1,30 @@
+"""Client-side evaluation module for WAA benchmarks.
+
+This module provides client-side evaluation without requiring a sidecar service.
+The evaluators run locally, making HTTP calls to the WAA server's /execute endpoint.
+
+Example usage:
+ from openadapt_evals.evaluation import EvaluatorClient, discover_vm_ip
+
+ # Auto-detect VM IP
+ vm_ip = discover_vm_ip()
+
+ # Or create client with auto-detection
+ client = EvaluatorClient() # Auto-detects IP
+ client = EvaluatorClient(vm_ip="20.127.64.200") # Explicit IP
+
+ # Evaluate a task
+ result = client.evaluate(task_config)
+ print(f"Success: {result.success}, Score: {result.score}")
+"""
+
+from .discovery import VMIPDiscovery, DiscoveryMethod, discover_vm_ip
+from .client import EvaluatorClient, EvaluationResult
+
+__all__ = [
+ "VMIPDiscovery",
+ "DiscoveryMethod",
+ "discover_vm_ip",
+ "EvaluatorClient",
+ "EvaluationResult",
+]
diff --git a/openadapt_evals/evaluation/client.py b/openadapt_evals/evaluation/client.py
new file mode 100644
index 0000000..28dedad
--- /dev/null
+++ b/openadapt_evals/evaluation/client.py
@@ -0,0 +1,307 @@
+"""Client-side evaluator for WAA benchmarks.
+
+Runs WAA evaluators locally, making HTTP calls to the WAA server's /execute endpoint.
+This approach follows WAA's own design pattern and eliminates the need for a sidecar service.
+"""
+
+import sys
+import requests
+from pathlib import Path
+from typing import Any, Dict, Optional
+from dataclasses import dataclass, field
+
+
+@dataclass
+class EvaluationResult:
+ """Result of evaluating a benchmark task."""
+ success: bool
+ score: float
+ actual: Any = None
+ expected: Any = None
+ reason: str = ""
+ metrics: Dict[str, Any] = field(default_factory=dict)
+
+ def to_dict(self) -> Dict[str, Any]:
+ """Convert to dictionary for JSON serialization."""
+ return {
+ "success": self.success,
+ "score": self.score,
+ "actual": str(self.actual)[:500] if self.actual else None,
+ "expected": str(self.expected)[:500] if self.expected else None,
+ "reason": self.reason,
+ "metrics": self.metrics,
+ }
+
+
+class EvaluatorClient:
+ """Client-side evaluator that uses WAA's evaluators directly.
+
+ This client imports WAA's evaluator modules (getters, metrics) and runs them
+ locally. The getters make HTTP calls to the WAA server's /execute endpoint
+ to retrieve values from the Windows VM.
+
+ Example:
+ client = EvaluatorClient() # Auto-detects VM IP
+ result = client.evaluate(task_config)
+ """
+
+ def __init__(
+ self,
+ vm_ip: Optional[str] = None,
+ port: int = 5000,
+ waa_evaluators_path: Optional[Path] = None,
+ timeout: int = 30,
+ ):
+ """Initialize the evaluator client.
+
+ Args:
+ vm_ip: VM IP address. If None, auto-detects from multiple sources.
+ port: WAA server port (default 5000).
+ waa_evaluators_path: Path to WAA evaluators. If None, searches common locations.
+ timeout: HTTP request timeout in seconds.
+ """
+ from .discovery import discover_vm_ip
+
+ self.vm_ip = vm_ip or discover_vm_ip()
+ if not self.vm_ip:
+ raise ValueError(
+ "Could not auto-detect VM IP. Please provide vm_ip explicitly or "
+ "set WAA_VM_IP environment variable."
+ )
+
+ self.port = port
+ self.timeout = timeout
+ self.base_url = f"http://{self.vm_ip}:{self.port}"
+
+ # Find and load WAA evaluators
+ self._evaluators_path = waa_evaluators_path or self._find_evaluators_path()
+ self._getters = None
+ self._metrics = None
+ self._load_evaluators()
+
+ def _find_evaluators_path(self) -> Optional[Path]:
+ """Find WAA evaluators in common locations."""
+ search_paths = [
+ # Relative to openadapt-ml
+ Path(__file__).parent.parent.parent.parent / "openadapt-ml" / "vendor" / "WindowsAgentArena" / "src" / "win-arena-container" / "client" / "desktop_env",
+ # Relative to current file in openadapt-evals
+ Path(__file__).parent.parent.parent.parent / "vendor" / "WindowsAgentArena" / "src" / "win-arena-container" / "client" / "desktop_env",
+ # Absolute common locations
+ Path.home() / "WindowsAgentArena" / "src" / "win-arena-container" / "client" / "desktop_env",
+ Path("/opt/waa/client/desktop_env"),
+ ]
+
+ for path in search_paths:
+ evaluators_dir = path / "evaluators"
+ if evaluators_dir.exists() and (evaluators_dir / "getters.py").exists():
+ return path
+
+ return None
+
+ def _load_evaluators(self) -> None:
+ """Load WAA evaluator modules."""
+ if not self._evaluators_path:
+ return
+
+ # Add to sys.path if not already there
+ path_str = str(self._evaluators_path)
+ if path_str not in sys.path:
+ sys.path.insert(0, path_str)
+
+ # Also add parent for absolute imports
+ parent_str = str(self._evaluators_path.parent)
+ if parent_str not in sys.path:
+ sys.path.insert(0, parent_str)
+
+ try:
+ from evaluators import getters, metrics
+ self._getters = getters
+ self._metrics = metrics
+        except ImportError:
+            # Evaluators not available; the built-in fallback getters/metrics will be used
+            pass
+
+ def evaluate(self, task_config: Dict[str, Any]) -> EvaluationResult:
+ """Evaluate a benchmark task.
+
+ Args:
+ task_config: Task configuration with 'evaluator' section containing:
+ - result: Dict with 'type' specifying the getter function
+ - expected: Dict with 'value' or 'rules' specifying expected result
+ - func: Metric function name (default: 'exact_match')
+
+ Returns:
+ EvaluationResult with success status, score, and details.
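+
+        Example (illustrative config shape only; not copied from a real WAA task):
+            task_config = {
+                "evaluator": {
+                    "result": {"type": "file_content", "path": "C:/Users/Docs/report.txt"},
+                    "expected": {"value": "hello"},
+                    "func": "exact_match",
+                }
+            }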
+ """
+ evaluator_config = task_config.get("evaluator", {})
+
+ if not evaluator_config:
+ return EvaluationResult(
+ success=False,
+ score=0.0,
+ reason="No evaluator configuration in task"
+ )
+
+ try:
+ # Get actual value from VM
+ actual = self._get_actual_value(evaluator_config)
+
+ # Get expected value from config
+ expected = self._get_expected_value(evaluator_config)
+
+ # Run metric comparison
+ score = self._run_metric(evaluator_config, actual, expected)
+
+ return EvaluationResult(
+ success=score >= 1.0,
+ score=score,
+ actual=actual,
+ expected=expected,
+ reason=f"Metric returned score {score}",
+ metrics={"raw_score": score}
+ )
+
+ except Exception as e:
+ return EvaluationResult(
+ success=False,
+ score=0.0,
+ reason=f"Evaluation error: {str(e)}"
+ )
+
+ def _get_actual_value(self, evaluator_config: Dict[str, Any]) -> Any:
+ """Get actual value from VM using getter function."""
+ result_spec = evaluator_config.get("result", {})
+ getter_type = result_spec.get("type")
+
+ if not getter_type:
+ raise ValueError("No 'type' specified in evaluator.result")
+
+ # Create a mock env object that the getters expect
+ class HttpEnv:
+ def __init__(self, vm_ip: str, port: int, timeout: int):
+ self.vm_ip = vm_ip
+ self.port = port
+ self.timeout = timeout
+
+ def execute(self, command: str) -> Dict[str, Any]:
+ """Execute command on VM via HTTP."""
+ url = f"http://{self.vm_ip}:{self.port}/execute"
+ try:
+ response = requests.post(
+ url,
+ json={"command": command},
+ timeout=self.timeout
+ )
+ response.raise_for_status()
+ return response.json()
+ except requests.RequestException as e:
+ return {"error": str(e), "output": ""}
+
+ env = HttpEnv(self.vm_ip, self.port, self.timeout)
+
+ # Try WAA getter if available
+ if self._getters:
+ getter_func = getattr(self._getters, f"get_{getter_type}", None)
+ if getter_func:
+ return getter_func(env, result_spec)
+
+ # Fallback: direct HTTP call
+ return self._fallback_getter(env, getter_type, result_spec)
+
+ def _fallback_getter(self, env: Any, getter_type: str, spec: Dict[str, Any]) -> Any:
+ """Fallback getter implementation when WAA evaluators not available."""
+ # Common getter types
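+        # NOTE: these fallbacks assume the WAA /execute endpoint runs Windows shell
+        # commands (cmd.exe / PowerShell) and returns JSON with an "output" field.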
+ if getter_type == "file_content":
+ path = spec.get("path", "")
+ result = env.execute(f"type {path}")
+ return result.get("output", "")
+
+ elif getter_type == "registry_value":
+ key = spec.get("key", "")
+ value = spec.get("value", "")
+ result = env.execute(f'reg query "{key}" /v "{value}"')
+ return result.get("output", "")
+
+ elif getter_type == "process_running":
+ process = spec.get("process", "")
+ result = env.execute(f'tasklist /FI "IMAGENAME eq {process}"')
+ return process.lower() in result.get("output", "").lower()
+
+ elif getter_type == "window_exists":
+ title = spec.get("title", "")
+ result = env.execute(f'powershell "Get-Process | Where-Object {{$_.MainWindowTitle -like \'*{title}*\'}}"')
+ return bool(result.get("output", "").strip())
+
+ else:
+ raise ValueError(f"Unknown getter type: {getter_type}")
+
+ def _get_expected_value(self, evaluator_config: Dict[str, Any]) -> Any:
+ """Extract expected value from evaluator config."""
+ expected_spec = evaluator_config.get("expected", {})
+
+ # Direct value
+ if "value" in expected_spec:
+ return expected_spec["value"]
+
+ # Rules-based
+ rules = expected_spec.get("rules", {})
+ if "match" in rules:
+ return rules["match"]
+
+ return None
+
+ def _run_metric(self, evaluator_config: Dict[str, Any], actual: Any, expected: Any) -> float:
+ """Run metric function to compare actual vs expected."""
+ func_name = evaluator_config.get("func", "exact_match")
+
+ # Try WAA metric if available
+ if self._metrics:
+ metric_func = getattr(self._metrics, func_name, None)
+ if metric_func:
+ try:
+ return float(metric_func(actual, expected))
+ except Exception:
+ pass
+
+ # Fallback metrics
+ return self._fallback_metric(func_name, actual, expected)
+
+ def _fallback_metric(self, func_name: str, actual: Any, expected: Any) -> float:
+ """Fallback metric implementations."""
+ if func_name == "exact_match":
+ return 1.0 if actual == expected else 0.0
+
+ elif func_name == "contains":
+ if isinstance(actual, str) and isinstance(expected, str):
+ return 1.0 if expected.lower() in actual.lower() else 0.0
+ return 0.0
+
+ elif func_name == "fuzzy_match":
+ if isinstance(actual, str) and isinstance(expected, str):
+ # Simple fuzzy: check if most words match
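+                # e.g. actual="open the file", expected="open file" -> overlap 2 of 2 = 1.0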
+ actual_words = set(actual.lower().split())
+ expected_words = set(expected.lower().split())
+ if not expected_words:
+ return 0.0
+ overlap = len(actual_words & expected_words)
+ return overlap / len(expected_words)
+ return 0.0
+
+ elif func_name == "boolean":
+ return 1.0 if bool(actual) == bool(expected) else 0.0
+
+ else:
+ # Unknown metric, default to exact match
+ return 1.0 if actual == expected else 0.0
+
+ def health_check(self) -> bool:
+ """Check if WAA server is reachable."""
+ try:
+ response = requests.get(
+ f"{self.base_url}/probe",
+ timeout=5
+ )
+ return response.status_code == 200
+ except requests.RequestException:
+ return False
diff --git a/openadapt_evals/evaluation/discovery.py b/openadapt_evals/evaluation/discovery.py
new file mode 100644
index 0000000..9014e6c
--- /dev/null
+++ b/openadapt_evals/evaluation/discovery.py
@@ -0,0 +1,220 @@
+"""VM IP auto-discovery for WAA evaluation.
+
+Provides multiple methods to discover the WAA VM IP address without
+requiring manual configuration.
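+
+Example (illustrative):
+    from openadapt_evals.evaluation.discovery import VMIPDiscovery
+
+    result = VMIPDiscovery().discover()
+    if result.ip:
+        print(f"VM IP {result.ip} via {result.method.value} (confidence {result.confidence})")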
+"""
+
+import os
+import json
+import subprocess
+from enum import Enum
+from pathlib import Path
+from typing import Optional
+from dataclasses import dataclass
+
+
+class DiscoveryMethod(Enum):
+ """Methods for discovering VM IP address."""
+ EXPLICIT = "explicit" # Passed directly
+ ENV_VAR = "env_var" # From environment variable
+ SSH_TUNNEL = "ssh_tunnel" # localhost when tunnel active
+ DOCKER = "docker" # From Docker network inspection
+ SESSION_FILE = "session_file" # From session tracker file
+ AZURE_STATUS = "azure_status" # From azure_ops_status.json
+ PROBE = "probe" # Scan common IPs
+
+
+@dataclass
+class DiscoveryResult:
+ """Result of IP discovery attempt."""
+ ip: Optional[str]
+ method: DiscoveryMethod
+ confidence: float # 0.0 to 1.0
+ details: str
+
+
+class VMIPDiscovery:
+ """Discovers WAA VM IP address from multiple sources."""
+
+ def __init__(self):
+ self._cached_ip: Optional[str] = None
+ self._cached_method: Optional[DiscoveryMethod] = None
+
+ def discover(self, explicit_ip: Optional[str] = None) -> DiscoveryResult:
+ """Discover VM IP using multiple methods in priority order.
+
+ Args:
+ explicit_ip: If provided, use this IP directly.
+
+ Returns:
+ DiscoveryResult with the discovered IP and method used.
+ """
+ # Priority 1: Explicit IP provided
+ if explicit_ip:
+ return DiscoveryResult(
+ ip=explicit_ip,
+ method=DiscoveryMethod.EXPLICIT,
+ confidence=1.0,
+ details="IP provided explicitly"
+ )
+
+ # Priority 2: Environment variable
+ result = self._try_env_var()
+ if result.ip:
+ return result
+
+ # Priority 3: Session tracker file
+ result = self._try_session_file()
+ if result.ip:
+ return result
+
+ # Priority 4: Azure ops status file
+ result = self._try_azure_status()
+ if result.ip:
+ return result
+
+ # Priority 5: SSH tunnel (localhost)
+ result = self._try_ssh_tunnel()
+ if result.ip:
+ return result
+
+ # Priority 6: Docker network
+ result = self._try_docker()
+ if result.ip:
+ return result
+
+ # No IP found
+ return DiscoveryResult(
+ ip=None,
+ method=DiscoveryMethod.PROBE,
+ confidence=0.0,
+ details="No VM IP could be discovered from any source"
+ )
+
+ def _try_env_var(self) -> DiscoveryResult:
+ """Try to get IP from environment variables."""
+ for var in ["WAA_VM_IP", "VM_IP", "AZURE_VM_IP"]:
+ ip = os.environ.get(var)
+ if ip:
+ return DiscoveryResult(
+ ip=ip,
+ method=DiscoveryMethod.ENV_VAR,
+ confidence=0.95,
+ details=f"From environment variable {var}"
+ )
+ return DiscoveryResult(None, DiscoveryMethod.ENV_VAR, 0.0, "No env var set")
+
+ def _try_session_file(self) -> DiscoveryResult:
+ """Try to get IP from session tracker file."""
+ session_paths = [
+ Path.home() / ".openadapt" / "vm_session.json",
+ Path("benchmark_results/vm_session.json"),
+ Path("/tmp/vm_session.json"),
+ ]
+
+ for path in session_paths:
+ try:
+ if path.exists():
+ data = json.loads(path.read_text())
+ ip = data.get("vm_ip")
+ if ip:
+ return DiscoveryResult(
+ ip=ip,
+ method=DiscoveryMethod.SESSION_FILE,
+ confidence=0.9,
+ details=f"From session file: {path}"
+ )
+ except (json.JSONDecodeError, IOError):
+ continue
+
+ return DiscoveryResult(None, DiscoveryMethod.SESSION_FILE, 0.0, "No session file found")
+
+ def _try_azure_status(self) -> DiscoveryResult:
+ """Try to get IP from azure_ops_status.json."""
+ status_paths = [
+ Path("benchmark_results/azure_ops_status.json"),
+ Path("training_output/current/azure_ops_status.json"),
+ ]
+
+ for path in status_paths:
+ try:
+ if path.exists():
+ data = json.loads(path.read_text())
+ ip = data.get("vm_ip")
+ if ip:
+ return DiscoveryResult(
+ ip=ip,
+ method=DiscoveryMethod.AZURE_STATUS,
+ confidence=0.85,
+ details=f"From status file: {path}"
+ )
+ except (json.JSONDecodeError, IOError):
+ continue
+
+ return DiscoveryResult(None, DiscoveryMethod.AZURE_STATUS, 0.0, "No status file found")
+
+ def _try_ssh_tunnel(self) -> DiscoveryResult:
+ """Check if SSH tunnel is active (localhost:5000 reachable)."""
+ import socket
+
+ try:
+ sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+ sock.settimeout(1)
+ result = sock.connect_ex(('localhost', 5000))
+ sock.close()
+
+ if result == 0:
+ return DiscoveryResult(
+ ip="localhost",
+ method=DiscoveryMethod.SSH_TUNNEL,
+ confidence=0.95,
+ details="SSH tunnel detected on localhost:5000"
+ )
+ except socket.error:
+ pass
+
+ return DiscoveryResult(None, DiscoveryMethod.SSH_TUNNEL, 0.0, "No SSH tunnel detected")
+
+ def _try_docker(self) -> DiscoveryResult:
+ """Try to get IP from Docker network inspection."""
+ try:
+ # Check if we're inside a Docker container
+ result = subprocess.run(
+ ["docker", "inspect", "winarena", "--format", "{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}"],
+ capture_output=True,
+ text=True,
+ timeout=5
+ )
+ if result.returncode == 0 and result.stdout.strip():
+ ip = result.stdout.strip()
+ return DiscoveryResult(
+ ip=ip,
+ method=DiscoveryMethod.DOCKER,
+ confidence=0.9,
+ details=f"From Docker container inspection"
+ )
+ except (subprocess.TimeoutExpired, FileNotFoundError):
+ pass
+
+ # Check for standard Docker bridge IP
+ docker_ip = "172.30.0.2" # Standard WAA container IP
+ return DiscoveryResult(
+ ip=docker_ip,
+ method=DiscoveryMethod.DOCKER,
+ confidence=0.5,
+ details="Using default Docker bridge IP (unverified)"
+ )
+
+
+def discover_vm_ip(explicit_ip: Optional[str] = None) -> Optional[str]:
+ """Convenience function to discover VM IP.
+
+ Args:
+ explicit_ip: If provided, returns this IP directly.
+
+ Returns:
+ Discovered VM IP or None if not found.
+ """
+ discovery = VMIPDiscovery()
+ result = discovery.discover(explicit_ip)
+ return result.ip
diff --git a/openadapt_evals/infrastructure/__init__.py b/openadapt_evals/infrastructure/__init__.py
new file mode 100644
index 0000000..9815495
--- /dev/null
+++ b/openadapt_evals/infrastructure/__init__.py
@@ -0,0 +1,32 @@
+"""Infrastructure components for VM management and monitoring.
+
+This module provides:
+- VMMonitor: Azure VM status monitoring
+- AzureOpsTracker: Azure operation logging
+- SSHTunnelManager: SSH tunnel management for VNC/API access
+
+Example:
+ ```python
+ from openadapt_evals.infrastructure import VMMonitor, SSHTunnelManager
+
+ # Monitor VM status
+ monitor = VMMonitor()
+ status = monitor.get_status()
+
+ # Manage SSH tunnels
+ tunnel_manager = SSHTunnelManager()
+ tunnel_manager.start_tunnels_for_vm("172.171.112.41", "azureuser")
+ ```
+"""
+
+from openadapt_evals.infrastructure.vm_monitor import VMMonitor, VMConfig
+from openadapt_evals.infrastructure.azure_ops_tracker import AzureOpsTracker
+from openadapt_evals.infrastructure.ssh_tunnel import SSHTunnelManager, get_tunnel_manager
+
+__all__ = [
+ "VMMonitor",
+ "VMConfig",
+ "AzureOpsTracker",
+ "SSHTunnelManager",
+ "get_tunnel_manager",
+]
diff --git a/openadapt_evals/infrastructure/azure_ops_tracker.py b/openadapt_evals/infrastructure/azure_ops_tracker.py
new file mode 100644
index 0000000..87bea0f
--- /dev/null
+++ b/openadapt_evals/infrastructure/azure_ops_tracker.py
@@ -0,0 +1,521 @@
+"""Azure operations status tracker.
+
+Writes real-time status to azure_ops_status.json for dashboard consumption.
+Used by CLI commands (setup-waa, run-waa, vm monitor) to provide visibility
+into long-running Azure operations.
+
+Usage:
+ from openadapt_evals.infrastructure.azure_ops_tracker import AzureOpsTracker
+
+ tracker = AzureOpsTracker()
+ tracker.start_operation("docker_build", total_steps=12)
+ tracker.update(phase="pulling_base_image", step=1, log_lines=["Pulling from ..."])
+ tracker.append_log("Step 1/12 : FROM dockurr/windows:latest")
+ tracker.finish_operation()
+"""
+
+from __future__ import annotations
+
+import json
+import re
+from dataclasses import dataclass, asdict, field
+from datetime import datetime
+from pathlib import Path
+from typing import Any
+
+# VM pricing from vm_monitor.py
+VM_HOURLY_RATES = {
+ "Standard_D2_v3": 0.096,
+ "Standard_D4_v3": 0.192,
+ "Standard_D8_v3": 0.384,
+ "Standard_D4s_v3": 0.192,
+ "Standard_D8s_v3": 0.384,
+ "Standard_D4ds_v5": 0.422, # Updated pricing as per spec
+ "Standard_D8ds_v5": 0.384,
+ "Standard_D16ds_v5": 0.768,
+ "Standard_D32ds_v5": 1.536,
+}
+
+# Typical operation durations in seconds (for ETA estimation)
+TYPICAL_DURATIONS = {
+ "docker_build": 600, # ~10 minutes for waa-auto build
+ "docker_pull": 300, # ~5 minutes for large image pull
+ "windows_boot": 900, # ~15 minutes for first Windows boot
+ "benchmark": 1800, # ~30 minutes for 20 tasks
+}
+
+DEFAULT_OUTPUT_FILE = Path("benchmark_results/azure_ops_status.json")
+
+
+@dataclass
+class AzureOpsStatus:
+ """Status of current Azure operation.
+
+ Attributes:
+ operation: Current operation type (idle, vm_create, docker_install,
+ docker_build, windows_boot, benchmark, etc.)
+ phase: Specific phase within the operation.
+ step: Current step number.
+ total_steps: Total number of steps in the operation.
+ progress_pct: Progress percentage (0-100).
+ log_tail: Last N lines of log output.
+ started_at: ISO timestamp when operation started.
+ elapsed_seconds: Seconds since operation started.
+ eta_seconds: Estimated seconds remaining (None if unknown).
+ cost_usd: Running cost in USD.
+ hourly_rate_usd: Hourly VM rate in USD.
+ vm_ip: VM IP address if available.
+ vm_state: VM power state (running, starting, stopped, deallocated).
+ vm_size: Azure VM size.
+ vnc_url: VNC URL for accessing Windows desktop.
+ error: Error message if operation failed.
+ download_bytes: Bytes downloaded so far (for image pulls).
+ download_total_bytes: Total bytes to download.
+ build_id: Current Docker build run ID (to detect new builds).
+ """
+
+ operation: str = "idle"
+ phase: str = ""
+ step: int = 0
+ total_steps: int = 0
+ progress_pct: float = 0.0
+ log_tail: list[str] = field(default_factory=list)
+ started_at: str | None = None
+ elapsed_seconds: float = 0.0
+ eta_seconds: float | None = None
+ cost_usd: float = 0.0
+ hourly_rate_usd: float = 0.422 # Default for Standard_D4ds_v5
+ vm_ip: str | None = None
+ vm_state: str = "unknown"
+ vm_size: str = "Standard_D4ds_v5"
+ vnc_url: str | None = None
+ error: str | None = None
+ download_bytes: int = 0
+ download_total_bytes: int = 0
+ build_id: str | None = None
+
+ def to_dict(self) -> dict[str, Any]:
+ """Convert to dictionary for JSON serialization."""
+ return asdict(self)
+
+
+class AzureOpsTracker:
+ """Tracks Azure operations and writes status to JSON file.
+
+ The tracker maintains a status file that the dashboard can poll to
+ display real-time progress of Azure operations.
+ """
+
+ MAX_LOG_LINES = 100
+
+ def __init__(
+ self,
+ output_file: str | Path = DEFAULT_OUTPUT_FILE,
+ vm_size: str = "Standard_D4ds_v5",
+ ):
+ """Initialize tracker.
+
+ Args:
+ output_file: Path to output JSON file.
+ vm_size: Azure VM size for cost calculation.
+ """
+ self.output_file = Path(output_file)
+ self.vm_size = vm_size
+ self.hourly_rate = VM_HOURLY_RATES.get(vm_size, 0.422)
+ self._status = AzureOpsStatus(
+ vm_size=vm_size,
+ hourly_rate_usd=self.hourly_rate,
+ )
+ self._start_time: datetime | None = None
+
+ def start_operation(
+ self,
+ operation: str,
+ total_steps: int = 0,
+ phase: str = "",
+ vm_ip: str | None = None,
+ vm_state: str = "running",
+ build_id: str | None = None,
+ started_at: datetime | None = None,
+ ) -> None:
+ """Start tracking a new operation.
+
+ Args:
+ operation: Operation type (vm_create, docker_install, docker_build,
+ windows_boot, benchmark, etc.)
+ total_steps: Total number of steps in the operation.
+ phase: Initial phase description.
+ vm_ip: VM IP address if known.
+ vm_state: VM power state.
+ build_id: Unique identifier for this build (to detect new builds).
+ started_at: When the operation actually started (uses now if not provided).
+ """
+ self._start_time = started_at or datetime.now()
+ self._status = AzureOpsStatus(
+ operation=operation,
+ phase=phase,
+ step=0,
+ total_steps=total_steps,
+ progress_pct=0.0,
+ log_tail=[], # Clear stale logs
+ started_at=self._start_time.isoformat(),
+ elapsed_seconds=0.0,
+ eta_seconds=TYPICAL_DURATIONS.get(
+ operation
+ ), # Use typical duration as initial ETA
+ cost_usd=0.0,
+ hourly_rate_usd=self.hourly_rate,
+ vm_ip=vm_ip,
+ vm_state=vm_state,
+ vm_size=self.vm_size,
+ vnc_url="http://localhost:8006" if vm_ip else None,
+ error=None,
+ download_bytes=0,
+ download_total_bytes=0,
+ build_id=build_id,
+ )
+ self._write_status()
+
+ def update(
+ self,
+ phase: str | None = None,
+ step: int | None = None,
+ total_steps: int | None = None,
+ log_lines: list[str] | None = None,
+ vm_ip: str | None = None,
+ vm_state: str | None = None,
+ error: str | None = None,
+ download_bytes: int | None = None,
+ download_total_bytes: int | None = None,
+ build_id: str | None = None,
+ ) -> None:
+ """Update operation status.
+
+ Args:
+ phase: Current phase description.
+ step: Current step number.
+ total_steps: Total steps (can be updated if discovered during operation).
+ log_lines: New log lines to append.
+ vm_ip: VM IP address.
+ vm_state: VM power state.
+ error: Error message if operation failed.
+ download_bytes: Bytes downloaded so far.
+ download_total_bytes: Total bytes to download.
+ build_id: Build identifier (clears log if different from current).
+ """
+ # If build_id changed, this is a new build - clear stale logs
+ if build_id is not None and build_id != self._status.build_id:
+ self._status.build_id = build_id
+ self._status.log_tail = []
+ self._status.error = None
+ self._start_time = datetime.now()
+ self._status.started_at = self._start_time.isoformat()
+
+ if phase is not None:
+ self._status.phase = phase
+ if step is not None:
+ self._status.step = step
+ if total_steps is not None:
+ self._status.total_steps = total_steps
+ if log_lines is not None:
+ for line in log_lines:
+ self.append_log(line)
+ if vm_ip is not None:
+ self._status.vm_ip = vm_ip
+ self._status.vnc_url = "http://localhost:8006"
+ if vm_state is not None:
+ self._status.vm_state = vm_state
+ if error is not None:
+ self._status.error = error
+ if download_bytes is not None:
+ self._status.download_bytes = download_bytes
+ if download_total_bytes is not None:
+ self._status.download_total_bytes = download_total_bytes
+
+ # Update derived fields
+ self._update_progress()
+ self._write_status()
+
+ def append_log(self, line: str) -> None:
+ """Append a log line (keeps last MAX_LOG_LINES).
+
+ Args:
+ line: Log line to append.
+ """
+ self._status.log_tail.append(line.rstrip())
+ if len(self._status.log_tail) > self.MAX_LOG_LINES:
+ self._status.log_tail = self._status.log_tail[-self.MAX_LOG_LINES :]
+ self._update_progress()
+ self._write_status()
+
+ def parse_docker_build_line(self, line: str) -> dict[str, Any]:
+ """Parse Docker build output for step progress and download info.
+
+ Handles both patterns:
+ - Old style: "Step X/Y : ..."
+ - Buildx style: "#N [stage X/Y] ..." or "#N sha256:... XXXMB / YGB ..."
+
+ Args:
+ line: Docker build output line.
+
+ Returns:
+ Dict with parsed info: {step, total_steps, download_bytes, download_total_bytes, phase}
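+
+        Example (illustrative):
+            tracker.parse_docker_build_line("Step 1/12 : FROM dockurr/windows:latest")
+            # -> {"step": 1, "total_steps": 12}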
+ """
+ result: dict[str, Any] = {}
+
+ # Old style: "Step X/Y : ..."
+ step_match = re.search(r"Step\s+(\d+)/(\d+)", line)
+ if step_match:
+ result["step"] = int(step_match.group(1))
+ result["total_steps"] = int(step_match.group(2))
+
+ # Buildx style: "#N [stage X/Y] ..."
+ buildx_stage = re.search(r"#\d+\s+\[.*?\s+(\d+)/(\d+)\]", line)
+ if buildx_stage:
+ result["step"] = int(buildx_stage.group(1))
+ result["total_steps"] = int(buildx_stage.group(2))
+
+ # Download progress: "sha256:... XXXMB / YGB ..." or "XXX.XXMB / YY.YYGB ..."
+ download_match = re.search(
+ r"(\d+(?:\.\d+)?)\s*(MB|GB|KB|B)\s*/\s*(\d+(?:\.\d+)?)\s*(MB|GB|KB|B)",
+ line,
+ )
+ if download_match:
+ size_multipliers = {"B": 1, "KB": 1024, "MB": 1024**2, "GB": 1024**3}
+ downloaded = float(download_match.group(1))
+ downloaded_unit = download_match.group(2)
+ total = float(download_match.group(3))
+ total_unit = download_match.group(4)
+ result["download_bytes"] = int(
+ downloaded * size_multipliers[downloaded_unit]
+ )
+ result["download_total_bytes"] = int(total * size_multipliers[total_unit])
+
+ # Extract phase from buildx output
+ if line.startswith("#"):
+ # #N DONE, #N CACHED, #N [stage]
+ phase_match = re.match(r"#\d+\s+(.*)", line)
+ if phase_match:
+ phase_text = phase_match.group(1)[:80]
+ # Clean up ANSI codes
+ phase_text = re.sub(r"\x1b\[[0-9;]*m", "", phase_text)
+ result["phase"] = phase_text.strip()
+
+ # Apply updates if we found anything
+ if "step" in result:
+ self._status.step = result["step"]
+ if "total_steps" in result:
+ self._status.total_steps = result["total_steps"]
+ if "download_bytes" in result:
+ self._status.download_bytes = result["download_bytes"]
+ if "download_total_bytes" in result:
+ self._status.download_total_bytes = result["download_total_bytes"]
+ if "phase" in result:
+ self._status.phase = result["phase"]
+
+ if result:
+ self._update_progress()
+
+ return result
+
+ def is_error_line(self, line: str) -> bool:
+ """Check if a line is an error message.
+
+ Args:
+ line: Log line to check.
+
+ Returns:
+ True if line contains an error.
+ """
+ error_patterns = [
+ r"ERROR:",
+ r"failed to build",
+ r"failed to solve",
+ r"error reading from server",
+ r"rpc error",
+ ]
+ return any(re.search(p, line, re.IGNORECASE) for p in error_patterns)
+
+ def finish_operation(self, success: bool = True, error: str | None = None) -> None:
+ """Mark operation as complete.
+
+ Args:
+ success: Whether the operation completed successfully.
+ error: Error message if operation failed.
+ """
+ if error:
+ self._status.error = error
+ self._status.operation = "complete" if success else "failed"
+ self._status.progress_pct = 100.0 if success else self._status.progress_pct
+ self._update_progress()
+ self._write_status()
+
+ def set_idle(self) -> None:
+ """Reset tracker to idle state."""
+ self._start_time = None
+ self._status = AzureOpsStatus(
+ vm_size=self.vm_size,
+ hourly_rate_usd=self.hourly_rate,
+ )
+ self._write_status()
+
+ def get_status(self) -> AzureOpsStatus:
+ """Get current status (with updated elapsed time and cost)."""
+ self._update_progress()
+ return self._status
+
+ def _update_progress(self) -> None:
+ """Update derived fields (elapsed time, cost, progress percentage, ETA)."""
+ # Update elapsed time
+ if self._start_time:
+ elapsed = datetime.now() - self._start_time
+ self._status.elapsed_seconds = elapsed.total_seconds()
+
+ # Update cost
+ elapsed_hours = self._status.elapsed_seconds / 3600
+ self._status.cost_usd = elapsed_hours * self.hourly_rate
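+            # e.g. 5400 s elapsed at the default $0.422/hr -> 1.5 h * 0.422 = ~$0.63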
+
+ # Calculate progress from multiple sources
+ progress_pct = 0.0
+ eta_seconds = None
+
+ # 1. Download progress (most accurate during image pulls)
+ if self._status.download_total_bytes > 0:
+ download_pct = (
+ self._status.download_bytes / self._status.download_total_bytes
+ ) * 100
+ progress_pct = max(progress_pct, download_pct)
+
+ # ETA from download speed
+ if self._status.download_bytes > 0 and self._status.elapsed_seconds > 1:
+ bytes_per_sec = (
+ self._status.download_bytes / self._status.elapsed_seconds
+ )
+ remaining_bytes = (
+ self._status.download_total_bytes - self._status.download_bytes
+ )
+ if bytes_per_sec > 0:
+ eta_seconds = remaining_bytes / bytes_per_sec
+
+ # 2. Step-based progress
+ if self._status.total_steps > 0:
+ step_pct = (self._status.step / self._status.total_steps) * 100
+ progress_pct = max(progress_pct, step_pct)
+
+ # ETA from step rate (only if we have meaningful progress)
+ if self._status.step > 0 and self._status.elapsed_seconds > 10:
+ time_per_step = self._status.elapsed_seconds / self._status.step
+ remaining_steps = self._status.total_steps - self._status.step
+ step_eta = time_per_step * remaining_steps
+ # Use step ETA if we don't have download ETA or if step progress > download
+ if (
+ eta_seconds is None
+ or step_pct
+ > (
+ self._status.download_bytes
+ / max(self._status.download_total_bytes, 1)
+ )
+ * 100
+ ):
+ eta_seconds = step_eta
+
+ # 3. Fallback: Use typical duration if no progress info
+ if eta_seconds is None and self._status.operation in TYPICAL_DURATIONS:
+ typical = TYPICAL_DURATIONS[self._status.operation]
+ remaining = max(0, typical - self._status.elapsed_seconds)
+ eta_seconds = remaining
+ # Estimate progress from elapsed vs typical
+ if progress_pct == 0 and self._status.elapsed_seconds > 0:
+ progress_pct = min(95, (self._status.elapsed_seconds / typical) * 100)
+
+ self._status.progress_pct = min(100.0, progress_pct)
+ self._status.eta_seconds = eta_seconds
+
+ def _write_status(self) -> None:
+ """Write current status to JSON file."""
+ self.output_file.parent.mkdir(parents=True, exist_ok=True)
+ with open(self.output_file, "w") as f:
+ json.dump(self._status.to_dict(), f, indent=2)
+
+
+# Global tracker instance for convenience
+_tracker: AzureOpsTracker | None = None
+
+
+def get_tracker(
+ output_file: str | Path = DEFAULT_OUTPUT_FILE,
+ vm_size: str = "Standard_D4ds_v5",
+) -> AzureOpsTracker:
+ """Get or create global tracker instance.
+
+ Args:
+ output_file: Path to output JSON file.
+ vm_size: Azure VM size for cost calculation.
+
+ Returns:
+ AzureOpsTracker instance.
+ """
+ global _tracker
+ if _tracker is None:
+ _tracker = AzureOpsTracker(output_file=output_file, vm_size=vm_size)
+ return _tracker
+
+
+def read_status(
+ status_file: str | Path = DEFAULT_OUTPUT_FILE,
+) -> dict[str, Any]:
+ """Read status from JSON file with fresh computed values.
+
+ This function reads the persisted status and recomputes time-dependent
+ fields (elapsed_seconds, cost_usd) based on the current time. This ensures
+ the API always returns accurate values without relying on client-side
+ computation.
+
+ Args:
+ status_file: Path to status JSON file.
+
+ Returns:
+ Status dictionary with fresh elapsed_seconds and cost_usd, or idle status
+ if file doesn't exist.
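+
+    Example (illustrative):
+        status = read_status()
+        print(status["operation"], f"{status['progress_pct']:.0f}%", f"${status['cost_usd']:.2f}")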
+ """
+ status_path = Path(status_file)
+ if status_path.exists():
+ try:
+ with open(status_path) as f:
+ status = json.load(f)
+
+ # Recompute time-dependent fields if operation is active
+ if status.get("started_at") and status.get("operation") not in (
+ "idle",
+ "complete",
+ "failed",
+ ):
+ started_at = datetime.fromisoformat(status["started_at"])
+ elapsed = datetime.now() - started_at
+ elapsed_seconds = max(0, elapsed.total_seconds())
+
+ # Update elapsed time
+ status["elapsed_seconds"] = elapsed_seconds
+
+ # Update cost based on elapsed time
+ hourly_rate = status.get("hourly_rate_usd", 0.422)
+ status["cost_usd"] = (elapsed_seconds / 3600) * hourly_rate
+
+ # Update ETA if we have progress info
+ progress_pct = status.get("progress_pct", 0)
+ if progress_pct > 0 and elapsed_seconds > 10:
+ # Estimate remaining time from progress rate
+ time_per_pct = elapsed_seconds / progress_pct
+ remaining_pct = 100 - progress_pct
+ status["eta_seconds"] = time_per_pct * remaining_pct
+ elif status.get("operation") in TYPICAL_DURATIONS:
+ # Use typical duration minus elapsed
+ typical = TYPICAL_DURATIONS[status["operation"]]
+ status["eta_seconds"] = max(0, typical - elapsed_seconds)
+
+ return status
+ except (json.JSONDecodeError, IOError, ValueError):
+ pass
+
+ # Return default idle status
+ return AzureOpsStatus().to_dict()
diff --git a/openadapt_evals/infrastructure/ssh_tunnel.py b/openadapt_evals/infrastructure/ssh_tunnel.py
new file mode 100644
index 0000000..7a08fd6
--- /dev/null
+++ b/openadapt_evals/infrastructure/ssh_tunnel.py
@@ -0,0 +1,595 @@
+"""SSH Tunnel Manager for Azure VMs.
+
+This module provides automatic SSH tunnel management for accessing services
+running inside Azure VMs (VNC, WAA server) that are not exposed via NSG.
+
+Architecture:
+ Azure VMs have Network Security Groups (NSGs) that act as firewalls.
+ By default, only port 22 (SSH) is open. To access other services like
+ VNC (8006) and WAA (5000), we create SSH tunnels:
+
+ Browser → localhost:8006 → SSH Tunnel → Azure VM:8006 → Docker → noVNC
+
+ This is more secure than opening ports in NSG because:
+ 1. All traffic is encrypted through SSH
+ 2. No authentication bypass (VNC has no auth by default)
+ 3. Access requires SSH key authentication
+
+Usage:
+ from openadapt_evals.infrastructure.ssh_tunnel import SSHTunnelManager
+
+ # Create manager
+ manager = SSHTunnelManager()
+
+    # Start tunnels for a VM (defaults: VNC localhost:8006 -> VM:8006,
+    # WAA localhost:5001 -> VM:5000)
+    manager.start_tunnels_for_vm(
+        vm_ip="172.171.112.41",
+        ssh_user="azureuser",
+    )
+
+    # Check tunnel status (dict of tunnel name -> TunnelStatus)
+    status = manager.get_tunnel_status()
+    # status["vnc"].active -> True, status["vnc"].local_port -> 8006,
+    # status["vnc"].remote_endpoint -> "172.171.112.41:8006"
+
+ # Stop all tunnels
+ manager.stop_all_tunnels()
+
+Integration:
+ The SSHTunnelManager is integrated with the dashboard server (local.py):
+ - When a VM's WAA probe becomes "ready", tunnels are auto-started
+ - When VM goes offline, tunnels are auto-stopped
+ - Dashboard shows tunnel status next to VNC button
+ - VNC button links to localhost:port (tunnel endpoint)
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+import signal
+import socket
+import subprocess
+import time
+from dataclasses import dataclass
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class TunnelConfig:
+ """Configuration for a single SSH tunnel."""
+
+ name: str # e.g., "vnc", "waa"
+ local_port: int # Local port to listen on
+ remote_port: int # Port on the remote VM
+ remote_host: str = "localhost" # Host on remote side (usually localhost)
+
+
+@dataclass
+class TunnelStatus:
+ """Status of an SSH tunnel."""
+
+ name: str
+ active: bool
+ local_port: int
+ remote_endpoint: str # e.g., "172.171.112.41:8006"
+ pid: int | None = None
+ error: str | None = None
+
+
+class SSHTunnelManager:
+ """Manages SSH tunnels for Azure VM access.
+
+ Provides automatic setup and teardown of SSH tunnels for services
+ running inside Azure VMs that are not exposed via NSG.
+
+ Features:
+ - Auto-reconnect: Automatically restarts dead tunnels
+ - Health monitoring: Periodic checks to verify tunnels are working
+ - Graceful handling of network interruptions
+
+ Attributes:
+ tunnels: Dict of tunnel name -> (TunnelConfig, process)
+ ssh_key_path: Path to SSH private key
+ """
+
+ # Default tunnel configurations
+ # Note: WAA uses local_port=5001 to avoid conflicts with any local WAA server on 5000
+ # The remote port is still 5000 (where WAA Flask runs inside Windows)
+ DEFAULT_TUNNELS = [
+ TunnelConfig(name="vnc", local_port=8006, remote_port=8006),
+ TunnelConfig(name="waa", local_port=5001, remote_port=5000),
+ ]
+
+ # Auto-reconnect settings
+ MAX_RECONNECT_ATTEMPTS = 3
+ RECONNECT_DELAY_SECONDS = 2
+
+ def __init__(
+ self,
+ ssh_key_path: str | Path | None = None,
+ tunnels: list[TunnelConfig] | None = None,
+ auto_reconnect: bool = True,
+ ):
+ """Initialize tunnel manager.
+
+ Args:
+ ssh_key_path: Path to SSH private key. Defaults to ~/.ssh/id_rsa.
+ tunnels: List of tunnel configurations. Defaults to VNC + WAA.
+ auto_reconnect: If True, automatically restart dead tunnels.
+ """
+ self.ssh_key_path = Path(ssh_key_path or Path.home() / ".ssh" / "id_rsa")
+ self.tunnel_configs = tunnels or self.DEFAULT_TUNNELS
+ self._active_tunnels: dict[str, tuple[TunnelConfig, subprocess.Popen]] = {}
+ self._current_vm_ip: str | None = None
+ self._current_ssh_user: str | None = None
+ self._auto_reconnect = auto_reconnect
+ self._reconnect_attempts: dict[
+ str, int
+ ] = {} # Track reconnect attempts per tunnel
+
+ def start_tunnels_for_vm(
+ self,
+ vm_ip: str,
+ ssh_user: str = "azureuser",
+ tunnels: list[TunnelConfig] | None = None,
+ ) -> dict[str, TunnelStatus]:
+ """Start SSH tunnels for a VM.
+
+ Args:
+ vm_ip: IP address of the Azure VM.
+ ssh_user: SSH username (default: azureuser).
+ tunnels: Optional list of tunnels to start. Defaults to all configured tunnels.
+
+ Returns:
+ Dict of tunnel name -> TunnelStatus.
+ """
+ self._current_vm_ip = vm_ip
+ self._current_ssh_user = ssh_user
+
+ tunnels_to_start = tunnels or self.tunnel_configs
+ results = {}
+
+ for config in tunnels_to_start:
+ status = self._start_tunnel(config, vm_ip, ssh_user)
+ results[config.name] = status
+
+ return results
+
+ def _start_tunnel(
+ self,
+ config: TunnelConfig,
+ vm_ip: str,
+ ssh_user: str,
+ ) -> TunnelStatus:
+ """Start a single SSH tunnel.
+
+ Args:
+ config: Tunnel configuration.
+ vm_ip: IP address of the Azure VM.
+ ssh_user: SSH username.
+
+ Returns:
+ TunnelStatus indicating success or failure.
+ """
+ # Check if tunnel already active
+ if config.name in self._active_tunnels:
+ proc = self._active_tunnels[config.name][1]
+ if proc.poll() is None: # Still running
+ logger.debug(f"Tunnel {config.name} already active")
+ return TunnelStatus(
+ name=config.name,
+ active=True,
+ local_port=config.local_port,
+ remote_endpoint=f"{vm_ip}:{config.remote_port}",
+ pid=proc.pid,
+ )
+
+ # Check if local port is already in use
+ if self._is_port_in_use(config.local_port):
+ # Port in use - check if it's an existing SSH tunnel (likely created manually)
+ # If we can reach the service through it, consider it active
+ if self._check_tunnel_works(config.local_port, config.remote_port):
+ logger.info(f"Port {config.local_port} has existing working tunnel")
+ return TunnelStatus(
+ name=config.name,
+ active=True,
+ local_port=config.local_port,
+ remote_endpoint=f"{vm_ip}:{config.remote_port}",
+ pid=None, # We don't know the PID of the external tunnel
+ )
+ else:
+ logger.warning(
+ f"Port {config.local_port} already in use by unknown process"
+ )
+ return TunnelStatus(
+ name=config.name,
+ active=False,
+ local_port=config.local_port,
+ remote_endpoint=f"{vm_ip}:{config.remote_port}",
+ error=f"Port {config.local_port} in use by another process",
+ )
+
+ # Build SSH command with keepalive settings to prevent timeout during long runs
+ # ServerAliveInterval=60: Send keepalive every 60 seconds
+ # ServerAliveCountMax=10: Disconnect after 10 missed keepalives (10 min tolerance)
+ # TCPKeepAlive=yes: Enable TCP-level keepalive as additional safeguard
+ ssh_cmd = [
+ "ssh",
+ "-o",
+ "StrictHostKeyChecking=no",
+ "-o",
+ "UserKnownHostsFile=/dev/null",
+ "-o",
+ "LogLevel=ERROR",
+ "-o",
+ "ServerAliveInterval=60",
+ "-o",
+ "ServerAliveCountMax=10",
+ "-o",
+ "TCPKeepAlive=yes",
+ "-o",
+ "ExitOnForwardFailure=yes",
+ "-i",
+ str(self.ssh_key_path),
+ "-N", # Don't execute remote command
+ "-L",
+ f"{config.local_port}:{config.remote_host}:{config.remote_port}",
+ f"{ssh_user}@{vm_ip}",
+ ]
+
+ try:
+ # Start SSH tunnel in background
+ proc = subprocess.Popen(
+ ssh_cmd,
+ stdout=subprocess.DEVNULL,
+ stderr=subprocess.PIPE,
+ start_new_session=True, # Detach from terminal
+ )
+
+ # Wait briefly to check if it started successfully
+ time.sleep(0.5)
+
+ if proc.poll() is not None:
+ # Process exited, get error
+ _, stderr = proc.communicate(timeout=1)
+ error_msg = stderr.decode().strip() if stderr else "Unknown error"
+ logger.error(f"Tunnel {config.name} failed: {error_msg}")
+ return TunnelStatus(
+ name=config.name,
+ active=False,
+ local_port=config.local_port,
+ remote_endpoint=f"{vm_ip}:{config.remote_port}",
+ error=error_msg[:200],
+ )
+
+ # Tunnel started successfully
+ self._active_tunnels[config.name] = (config, proc)
+ logger.info(
+ f"Started tunnel {config.name}: localhost:{config.local_port} -> {vm_ip}:{config.remote_port}"
+ )
+
+ return TunnelStatus(
+ name=config.name,
+ active=True,
+ local_port=config.local_port,
+ remote_endpoint=f"{vm_ip}:{config.remote_port}",
+ pid=proc.pid,
+ )
+
+ except Exception as e:
+ logger.error(f"Failed to start tunnel {config.name}: {e}")
+ return TunnelStatus(
+ name=config.name,
+ active=False,
+ local_port=config.local_port,
+ remote_endpoint=f"{vm_ip}:{config.remote_port}",
+ error=str(e)[:200],
+ )
+
+ def stop_tunnel(self, name: str) -> bool:
+ """Stop a specific tunnel by name.
+
+ Args:
+ name: Tunnel name (e.g., "vnc", "waa").
+
+ Returns:
+ True if tunnel was stopped, False if not found.
+ """
+ if name not in self._active_tunnels:
+ return False
+
+ config, proc = self._active_tunnels[name]
+
+ try:
+ # Send SIGTERM to gracefully stop
+ os.killpg(os.getpgid(proc.pid), signal.SIGTERM)
+ proc.wait(timeout=5)
+ except ProcessLookupError:
+ pass # Already dead
+ except subprocess.TimeoutExpired:
+ # Force kill
+ try:
+ os.killpg(os.getpgid(proc.pid), signal.SIGKILL)
+ except ProcessLookupError:
+ pass
+
+ del self._active_tunnels[name]
+ logger.info(f"Stopped tunnel {name}")
+ return True
+
+ def stop_all_tunnels(self) -> None:
+ """Stop all active tunnels."""
+ for name in list(self._active_tunnels.keys()):
+ self.stop_tunnel(name)
+ self._current_vm_ip = None
+ self._current_ssh_user = None
+
+ def get_tunnel_status(self, auto_restart: bool = True) -> dict[str, TunnelStatus]:
+ """Get status of all configured tunnels.
+
+ This method checks the actual port status, not just internal state.
+ This correctly reports tunnels as active even if they were started
+ by a different process or if the tunnel manager was restarted.
+
+ If auto_reconnect is enabled and a tunnel is found dead, this method
+ will attempt to restart it automatically.
+
+ Args:
+ auto_restart: If True and auto_reconnect is enabled, restart dead tunnels.
+
+ Returns:
+ Dict of tunnel name -> TunnelStatus.
+ """
+ results = {}
+ tunnels_to_restart = []
+
+ for config in self.tunnel_configs:
+ if config.name in self._active_tunnels:
+ _, proc = self._active_tunnels[config.name]
+ if proc.poll() is None: # Still running
+ # Reset reconnect attempts on successful check
+ self._reconnect_attempts[config.name] = 0
+ results[config.name] = TunnelStatus(
+ name=config.name,
+ active=True,
+ local_port=config.local_port,
+ remote_endpoint=f"{self._current_vm_ip}:{config.remote_port}"
+ if self._current_vm_ip
+ else "unknown",
+ pid=proc.pid,
+ )
+ else:
+ # Process died - but check if port is still working
+ # (could be another tunnel on the same port)
+ del self._active_tunnels[config.name]
+ if self._is_port_in_use(
+ config.local_port
+ ) and self._check_tunnel_works(
+ config.local_port, config.remote_port
+ ):
+ results[config.name] = TunnelStatus(
+ name=config.name,
+ active=True,
+ local_port=config.local_port,
+ remote_endpoint=f"{self._current_vm_ip}:{config.remote_port}"
+ if self._current_vm_ip
+ else "external",
+ pid=None, # External tunnel, PID unknown
+ )
+ else:
+ # Tunnel is dead - mark for restart if auto_reconnect enabled
+ if (
+ self._auto_reconnect
+ and auto_restart
+ and self._current_vm_ip
+ ):
+ tunnels_to_restart.append(config)
+ results[config.name] = TunnelStatus(
+ name=config.name,
+ active=False,
+ local_port=config.local_port,
+ remote_endpoint="",
+ error="Tunnel process exited",
+ )
+ else:
+ # Not tracked internally - but check if an external tunnel exists
+ # This handles tunnels started by other processes or after manager restart
+ if self._is_port_in_use(config.local_port) and self._check_tunnel_works(
+ config.local_port, config.remote_port
+ ):
+ logger.debug(
+ f"Found working external tunnel on port {config.local_port}"
+ )
+ results[config.name] = TunnelStatus(
+ name=config.name,
+ active=True,
+ local_port=config.local_port,
+ remote_endpoint=f"{self._current_vm_ip}:{config.remote_port}"
+ if self._current_vm_ip
+ else "external",
+ pid=None, # External tunnel, PID unknown
+ )
+ else:
+ results[config.name] = TunnelStatus(
+ name=config.name,
+ active=False,
+ local_port=config.local_port,
+ remote_endpoint="",
+ )
+
+ # Auto-restart dead tunnels
+ for config in tunnels_to_restart:
+ attempts = self._reconnect_attempts.get(config.name, 0)
+ if attempts < self.MAX_RECONNECT_ATTEMPTS:
+ logger.info(
+ f"Auto-reconnecting tunnel {config.name} (attempt {attempts + 1}/{self.MAX_RECONNECT_ATTEMPTS})"
+ )
+ time.sleep(self.RECONNECT_DELAY_SECONDS)
+ self._reconnect_attempts[config.name] = attempts + 1
+ status = self._start_tunnel(
+ config, self._current_vm_ip, self._current_ssh_user or "azureuser"
+ )
+ results[config.name] = status
+ if status.active:
+ logger.info(f"Successfully reconnected tunnel {config.name}")
+ self._reconnect_attempts[config.name] = 0 # Reset on success
+ else:
+ logger.warning(
+ f"Tunnel {config.name} exceeded max reconnect attempts ({self.MAX_RECONNECT_ATTEMPTS})"
+ )
+ results[config.name] = TunnelStatus(
+ name=config.name,
+ active=False,
+ local_port=config.local_port,
+ remote_endpoint="",
+ error=f"Max reconnect attempts ({self.MAX_RECONNECT_ATTEMPTS}) exceeded",
+ )
+
+ return results
+
+ def is_tunnel_active(self, name: str) -> bool:
+ """Check if a specific tunnel is active.
+
+ Args:
+ name: Tunnel name.
+
+ Returns:
+ True if tunnel is active.
+ """
+ status = self.get_tunnel_status()
+ return name in status and status[name].active
+
+ def reset_reconnect_attempts(self, name: str | None = None) -> None:
+ """Reset reconnect attempt counter for tunnels.
+
+        Call this after manually fixing connectivity issues or when the
+        VM is known to be healthy again.
+
+ Args:
+ name: Tunnel name to reset, or None to reset all.
+ """
+ if name:
+ self._reconnect_attempts[name] = 0
+ else:
+ self._reconnect_attempts.clear()
+ logger.info(f"Reset reconnect attempts for {name or 'all tunnels'}")
+
+ def ensure_tunnels_for_vm(
+ self,
+ vm_ip: str,
+ ssh_user: str = "azureuser",
+ ) -> dict[str, TunnelStatus]:
+ """Ensure tunnels are running for a VM, starting if needed.
+
+ This is idempotent - safe to call repeatedly.
+
+ Args:
+ vm_ip: IP address of the Azure VM.
+ ssh_user: SSH username.
+
+ Returns:
+ Dict of tunnel name -> TunnelStatus.
+ """
+ # If VM changed, stop old tunnels and reset reconnect attempts
+ if self._current_vm_ip and self._current_vm_ip != vm_ip:
+ logger.info(
+ f"VM IP changed from {self._current_vm_ip} to {vm_ip}, restarting tunnels"
+ )
+ self.stop_all_tunnels()
+ self.reset_reconnect_attempts() # Fresh start for new VM
+
+ # Check current status and start any missing tunnels
+ # get_tunnel_status will auto-restart dead tunnels if enabled
+ current_status = self.get_tunnel_status()
+ all_active = all(s.active for s in current_status.values())
+
+ if all_active and self._current_vm_ip == vm_ip:
+ return current_status
+
+ # Start tunnels (also resets reconnect attempts for this VM)
+ self.reset_reconnect_attempts()
+ return self.start_tunnels_for_vm(vm_ip, ssh_user)
+
+ def _is_port_in_use(self, port: int) -> bool:
+ """Check if a local port is in use.
+
+ Args:
+ port: Port number.
+
+ Returns:
+ True if port is in use.
+ """
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+ try:
+ s.bind(("localhost", port))
+ return False
+ except OSError:
+ return True
+
+ def _check_tunnel_works(self, local_port: int, remote_port: int) -> bool:
+ """Check if an existing tunnel on a port is actually working.
+
+ For VNC (8006), check if we get HTTP response from noVNC.
+ For WAA (5000), check if /probe endpoint responds.
+
+ Args:
+ local_port: Local port to check.
+ remote_port: Remote port (used to determine service type).
+
+ Returns:
+ True if tunnel appears to be working.
+ """
+ import urllib.request
+ import urllib.error
+
+ try:
+ if remote_port == 5000:
+ # WAA server - check /probe endpoint
+ req = urllib.request.Request(
+ f"http://localhost:{local_port}/probe",
+ method="GET",
+ )
+ with urllib.request.urlopen(req, timeout=3) as resp:
+ return resp.status == 200
+ elif remote_port == 8006:
+ # VNC - check if noVNC responds
+ req = urllib.request.Request(
+ f"http://localhost:{local_port}/",
+ method="GET",
+ )
+ with urllib.request.urlopen(req, timeout=3) as resp:
+ return resp.status == 200
+ else:
+ # Unknown service - try to connect
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+ s.settimeout(3)
+ s.connect(("localhost", local_port))
+ return True
+ except (urllib.error.URLError, socket.error, OSError):
+ return False
+
+ def __del__(self):
+ """Clean up tunnels on destruction."""
+ try:
+ self.stop_all_tunnels()
+ except Exception:
+ pass
+
+
+# Global tunnel manager instance
+_tunnel_manager: SSHTunnelManager | None = None
+
+
+def get_tunnel_manager() -> SSHTunnelManager:
+ """Get the global tunnel manager instance.
+
+ Returns:
+ SSHTunnelManager instance.
+ """
+ global _tunnel_manager
+ if _tunnel_manager is None:
+ _tunnel_manager = SSHTunnelManager()
+ return _tunnel_manager
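+
+
+# Example (illustrative sketch, not part of any committed workflow): reuse the
+# module-level manager to bring up tunnels for a VM and report their health.
+# The IP address below is a placeholder.
+#
+#     manager = get_tunnel_manager()
+#     manager.ensure_tunnels_for_vm("203.0.113.10", ssh_user="azureuser")
+#     for name, status in manager.get_tunnel_status().items():
+#         print(name, "up" if status.active else f"down ({status.error})")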
diff --git a/openadapt_evals/infrastructure/vm_monitor.py b/openadapt_evals/infrastructure/vm_monitor.py
new file mode 100644
index 0000000..c10560d
--- /dev/null
+++ b/openadapt_evals/infrastructure/vm_monitor.py
@@ -0,0 +1,1111 @@
+"""VM monitoring utilities for WAA benchmark evaluation.
+
+This module provides reusable classes for monitoring Windows VMs running WAA.
+Can be used by the viewer, CLI, or as a standalone tool.
+
+Enhanced with Azure ML job tracking, cost estimation, and activity detection.
+
+Usage:
+ # Monitor a single VM
+ from openadapt_evals.infrastructure.vm_monitor import VMMonitor, VMConfig
+
+ config = VMConfig(
+ name="azure-waa-vm",
+ ssh_host="172.171.112.41",
+ ssh_user="azureuser",
+ docker_container="winarena",
+ internal_ip="20.20.20.21",
+ )
+
+ monitor = VMMonitor(config)
+ status = monitor.check_status()
+ print(f"VNC: {status.vnc_reachable}, WAA: {status.waa_ready}")
+
+ # Or run continuous monitoring
+ monitor.run_monitor(callback=lambda s: print(s))
+
+ # Fetch Azure ML jobs
+ jobs = fetch_azure_ml_jobs(days=7)
+ print(f"Found {len(jobs)} jobs in last 7 days")
+
+ # Calculate VM costs
+ costs = calculate_vm_costs(vm_size="Standard_D4ds_v5", hours=2.5)
+ print(f"Estimated cost: ${costs['total_cost_usd']:.2f}")
+"""
+
+from __future__ import annotations
+
+import json
+import subprocess
+import time
+from dataclasses import dataclass, field, asdict
+from datetime import datetime, timedelta
+from pathlib import Path
+from typing import Callable
+import urllib.request
+import urllib.error
+import socket
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class VMConfig:
+ """Configuration for a WAA VM."""
+
+ name: str
+ ssh_host: str
+ ssh_user: str = "azureuser"
+ vnc_port: int = 8006
+ waa_port: int = 5000
+ qmp_port: int = 7200
+ docker_container: str = "winarena"
+ internal_ip: str = "20.20.20.21"
+
+ def to_dict(self) -> dict:
+ """Convert to dictionary for JSON serialization."""
+ return asdict(self)
+
+ @classmethod
+ def from_dict(cls, data: dict) -> VMConfig:
+ """Create from dictionary."""
+ return cls(**data)
+
+
+@dataclass
+class VMStatus:
+ """Status of a WAA VM at a point in time."""
+
+ config: VMConfig
+ timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
+ ssh_reachable: bool = False
+ vnc_reachable: bool = False
+ waa_ready: bool = False
+ waa_probe_response: str | None = None
+ container_running: bool = False
+ container_logs: str | None = None
+ disk_usage_gb: float | None = None
+ error: str | None = None
+
+ def to_dict(self) -> dict:
+ """Convert to dictionary for JSON serialization."""
+ return {
+ "config": self.config.to_dict(),
+ "timestamp": self.timestamp,
+ "ssh_reachable": self.ssh_reachable,
+ "vnc_reachable": self.vnc_reachable,
+ "waa_ready": self.waa_ready,
+ "waa_probe_response": self.waa_probe_response,
+ "container_running": self.container_running,
+ "container_logs": self.container_logs,
+ "disk_usage_gb": self.disk_usage_gb,
+ "error": self.error,
+ }
+
+
+class VMMonitor:
+ """Monitor a single WAA VM."""
+
+ def __init__(self, config: VMConfig, timeout: int = 5):
+ """Initialize monitor.
+
+ Args:
+ config: VM configuration.
+ timeout: Timeout in seconds for network operations.
+ """
+ self.config = config
+ self.timeout = timeout
+
+ def check_vnc(self) -> bool:
+ """Check if VNC port is reachable via SSH tunnel (localhost)."""
+ try:
+ # VNC is only accessible via SSH tunnel at localhost, not the public IP
+ url = f"http://localhost:{self.config.vnc_port}/"
+ req = urllib.request.Request(url, method="HEAD")
+ with urllib.request.urlopen(req, timeout=self.timeout):
+ return True
+        except Exception:
+ return False
+
+ def check_ssh(self) -> bool:
+ """Check if SSH is reachable."""
+ try:
+ result = subprocess.run(
+ [
+ "ssh",
+ "-o",
+ "StrictHostKeyChecking=no",
+ "-o",
+ f"ConnectTimeout={self.timeout}",
+ "-o",
+ "BatchMode=yes",
+ f"{self.config.ssh_user}@{self.config.ssh_host}",
+ "echo ok",
+ ],
+ capture_output=True,
+ text=True,
+ timeout=self.timeout + 5,
+ )
+ return result.returncode == 0 and "ok" in result.stdout
+        except Exception:
+ return False
+
+ def check_waa_probe(self) -> tuple[bool, str | None]:
+ """Check if WAA /probe endpoint responds.
+
+ Returns:
+ Tuple of (ready, response_text).
+ """
+ try:
+ cmd = f"curl -s --connect-timeout {self.timeout} http://{self.config.internal_ip}:{self.config.waa_port}/probe"
+ result = subprocess.run(
+ [
+ "ssh",
+ "-o",
+ "StrictHostKeyChecking=no",
+ "-o",
+ f"ConnectTimeout={self.timeout}",
+ "-o",
+ "BatchMode=yes",
+ f"{self.config.ssh_user}@{self.config.ssh_host}",
+ cmd,
+ ],
+ capture_output=True,
+ text=True,
+ timeout=self.timeout + 10,
+ )
+ response = result.stdout.strip()
+ if response and "error" not in response.lower():
+ return True, response
+ return False, response or None
+        except Exception as e:
+ return False, str(e)
+
+ def get_container_status(self) -> tuple[bool, str | None]:
+ """Check container status and get recent logs.
+
+ Returns:
+ Tuple of (running, last_log_lines).
+ """
+ try:
+ cmd = f"docker ps -q -f name={self.config.docker_container}"
+ result = subprocess.run(
+ [
+ "ssh",
+ "-o",
+ "StrictHostKeyChecking=no",
+ "-o",
+ f"ConnectTimeout={self.timeout}",
+ "-o",
+ "BatchMode=yes",
+ f"{self.config.ssh_user}@{self.config.ssh_host}",
+ cmd,
+ ],
+ capture_output=True,
+ text=True,
+ timeout=self.timeout + 5,
+ )
+ running = bool(result.stdout.strip())
+
+ if running:
+ # Get last few log lines
+ log_cmd = f"docker logs {self.config.docker_container} 2>&1 | tail -5"
+ log_result = subprocess.run(
+ [
+ "ssh",
+ "-o",
+ "StrictHostKeyChecking=no",
+ "-o",
+ f"ConnectTimeout={self.timeout}",
+ "-o",
+ "BatchMode=yes",
+ f"{self.config.ssh_user}@{self.config.ssh_host}",
+ log_cmd,
+ ],
+ capture_output=True,
+ text=True,
+ timeout=self.timeout + 10,
+ )
+ return True, log_result.stdout.strip()
+ return False, None
+        except Exception as e:
+ return False, str(e)
+
+ def get_disk_usage(self) -> float | None:
+ """Get disk usage of data.img in GB."""
+ try:
+ # Try common paths
+ paths = [
+ "/home/azureuser/waa-storage/data.img",
+ "/home/ubuntu/waa-storage/data.img",
+ "/storage/data.img",
+ ]
+ for path in paths:
+ cmd = f"du -b {path} 2>/dev/null | cut -f1"
+ result = subprocess.run(
+ [
+ "ssh",
+ "-o",
+ "StrictHostKeyChecking=no",
+ "-o",
+ f"ConnectTimeout={self.timeout}",
+ "-o",
+ "BatchMode=yes",
+ f"{self.config.ssh_user}@{self.config.ssh_host}",
+ cmd,
+ ],
+ capture_output=True,
+ text=True,
+ timeout=self.timeout + 5,
+ )
+ if result.returncode == 0 and result.stdout.strip():
+ try:
+ bytes_size = int(result.stdout.strip())
+ return round(bytes_size / (1024**3), 2)
+ except ValueError:
+ continue
+ return None
+        except Exception:
+ return None
+
+ def check_status(self) -> VMStatus:
+ """Perform full status check on the VM.
+
+ Returns:
+ VMStatus with all checks performed.
+ """
+ status = VMStatus(config=self.config)
+
+ try:
+ # Check VNC first (fastest, no SSH needed)
+ status.vnc_reachable = self.check_vnc()
+
+ # Check SSH
+ status.ssh_reachable = self.check_ssh()
+
+ if status.ssh_reachable:
+ # Check container
+ status.container_running, status.container_logs = (
+ self.get_container_status()
+ )
+
+ # Check WAA probe
+ status.waa_ready, status.waa_probe_response = self.check_waa_probe()
+
+ # Get disk usage
+ status.disk_usage_gb = self.get_disk_usage()
+ except Exception as e:
+ status.error = str(e)
+
+ return status
+
+ def run_monitor(
+ self,
+ callback: Callable[[VMStatus], None] | None = None,
+ interval: int = 30,
+ stop_on_ready: bool = True,
+ output_file: str | Path | None = None,
+ ) -> VMStatus:
+ """Run continuous monitoring until WAA is ready.
+
+ Args:
+ callback: Optional callback function called with each status update.
+ interval: Seconds between checks.
+ stop_on_ready: Stop monitoring when WAA is ready.
+ output_file: Optional file to write status updates (JSON lines).
+
+ Returns:
+ Final VMStatus (typically when WAA is ready).
+ """
+ output_path = Path(output_file) if output_file else None
+ if output_path:
+ output_path.parent.mkdir(parents=True, exist_ok=True)
+
+ while True:
+ status = self.check_status()
+
+ # Call callback if provided
+ if callback:
+ callback(status)
+
+ # Write to file if provided
+ if output_path:
+ with open(output_path, "a") as f:
+ f.write(json.dumps(status.to_dict()) + "\n")
+
+ # Check if we should stop
+ if stop_on_ready and status.waa_ready:
+ return status
+
+ time.sleep(interval)
+
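+# Example (illustrative sketch): block until WAA answers /probe, appending one
+# JSON line per check to a local file. The host below is a placeholder.
+#
+#     monitor = VMMonitor(VMConfig(name="waa", ssh_host="203.0.113.10"))
+#     monitor.run_monitor(interval=60, output_file="benchmark_results/waa_status.jsonl")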
+
+@dataclass
+class PoolWorker:
+ """A single worker in a VM pool."""
+
+ name: str
+ ip: str
+ status: str = "creating" # creating, ready, running, completed, failed, deleted
+ docker_container: str = "winarena"
+ waa_ready: bool = False
+ assigned_tasks: list[str] = field(default_factory=list)
+ completed_tasks: list[str] = field(default_factory=list)
+ current_task: str | None = None
+ error: str | None = None
+ created_at: str = field(default_factory=lambda: datetime.now().isoformat())
+ updated_at: str = field(default_factory=lambda: datetime.now().isoformat())
+
+
+@dataclass
+class VMPool:
+ """A pool of worker VMs for parallel WAA evaluation."""
+
+ pool_id: str
+ created_at: str
+ resource_group: str
+ location: str
+ vm_size: str
+ workers: list[PoolWorker]
+ total_tasks: int = 0
+ completed_tasks: int = 0
+ failed_tasks: int = 0
+
+
+class VMPoolRegistry:
+ """Manage VM pools for parallel WAA evaluation."""
+
+ REGISTRY_FILE = "benchmark_results/vm_pool_registry.json"
+
+ def __init__(self, registry_file: str | Path | None = None):
+ """Initialize pool registry.
+
+ Args:
+ registry_file: Path to JSON registry file.
+ """
+ self.registry_file = Path(registry_file or self.REGISTRY_FILE)
+ self._pool: VMPool | None = None
+ self.load()
+
+ def load(self) -> None:
+ """Load pool from registry file."""
+ if self.registry_file.exists():
+ try:
+ with open(self.registry_file) as f:
+ data = json.load(f)
+ workers = [PoolWorker(**w) for w in data.get("workers", [])]
+ self._pool = VMPool(
+ pool_id=data["pool_id"],
+ created_at=data["created_at"],
+ resource_group=data["resource_group"],
+ location=data["location"],
+ vm_size=data["vm_size"],
+ workers=workers,
+ total_tasks=data.get("total_tasks", 0),
+ completed_tasks=data.get("completed_tasks", 0),
+ failed_tasks=data.get("failed_tasks", 0),
+ )
+ except (json.JSONDecodeError, KeyError) as e:
+                logger.warning(f"Could not load pool registry: {e}")
+ self._pool = None
+
+ def save(self) -> None:
+ """Save pool to registry file."""
+ if self._pool is None:
+ return
+ self.registry_file.parent.mkdir(parents=True, exist_ok=True)
+ with open(self.registry_file, "w") as f:
+ json.dump(asdict(self._pool), f, indent=2)
+
+ def create_pool(
+ self,
+ workers: list[tuple[str, str]], # [(name, ip), ...]
+ resource_group: str,
+ location: str,
+ vm_size: str = "Standard_D4ds_v5",
+ ) -> VMPool:
+ """Create a new pool from created VMs.
+
+ Args:
+ workers: List of (name, ip) tuples.
+ resource_group: Azure resource group.
+ location: Azure region.
+ vm_size: VM size used.
+
+ Returns:
+ Created VMPool.
+ """
+ pool_id = datetime.now().strftime("%Y%m%d_%H%M%S")
+ self._pool = VMPool(
+ pool_id=pool_id,
+ created_at=datetime.now().isoformat(),
+ resource_group=resource_group,
+ location=location,
+ vm_size=vm_size,
+ workers=[
+ PoolWorker(name=name, ip=ip, status="ready") for name, ip in workers
+ ],
+ )
+ self.save()
+ return self._pool
+
+ def get_pool(self) -> VMPool | None:
+ """Get current pool."""
+ return self._pool
+
+ def update_worker(self, name: str, **kwargs) -> None:
+ """Update a worker's status.
+
+ Args:
+ name: Worker name.
+ **kwargs: Fields to update.
+ """
+ if self._pool is None:
+ return
+ for worker in self._pool.workers:
+ if worker.name == name:
+ for key, value in kwargs.items():
+ if hasattr(worker, key):
+ setattr(worker, key, value)
+ worker.updated_at = datetime.now().isoformat()
+ break
+ self.save()
+
+ def update_pool_progress(self, completed: int = 0, failed: int = 0) -> None:
+ """Update pool-level progress.
+
+ Args:
+ completed: Increment completed count by this amount.
+ failed: Increment failed count by this amount.
+ """
+ if self._pool is None:
+ return
+ self._pool.completed_tasks += completed
+ self._pool.failed_tasks += failed
+ self.save()
+
+ def delete_pool(self) -> bool:
+ """Delete the pool registry (VMs must be deleted separately).
+
+ Returns:
+ True if pool was deleted.
+ """
+ if self.registry_file.exists():
+ self.registry_file.unlink()
+ self._pool = None
+ return True
+ return False
+
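+# Example (illustrative sketch): track a two-worker pool with VMPoolRegistry.
+# Worker names, IPs, and the resource group below are placeholders, not real
+# Azure resources.
+#
+#     registry = VMPoolRegistry()
+#     registry.create_pool(
+#         workers=[("waa-worker-1", "10.0.0.4"), ("waa-worker-2", "10.0.0.5")],
+#         resource_group="my-rg",
+#         location="eastus",
+#     )
+#     registry.update_worker("waa-worker-1", status="running", current_task="task-001")
+#     registry.update_pool_progress(completed=1)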
+
+class VMRegistry:
+ """Manage a registry of VMs and their status."""
+
+ def __init__(
+ self, registry_file: str | Path = "benchmark_results/vm_registry.json"
+ ):
+ """Initialize registry.
+
+ Args:
+ registry_file: Path to JSON registry file.
+ """
+ self.registry_file = Path(registry_file)
+ self._vms: list[VMConfig] = []
+ self.load()
+
+ def load(self) -> None:
+ """Load VMs from registry file."""
+ if self.registry_file.exists():
+ with open(self.registry_file) as f:
+ data = json.load(f)
+ self._vms = [VMConfig.from_dict(vm) for vm in data]
+
+ def save(self) -> None:
+ """Save VMs to registry file."""
+ self.registry_file.parent.mkdir(parents=True, exist_ok=True)
+ with open(self.registry_file, "w") as f:
+ json.dump([vm.to_dict() for vm in self._vms], f, indent=2)
+
+ def add(self, config: VMConfig) -> None:
+ """Add a VM to the registry."""
+ # Remove existing VM with same name
+ self._vms = [vm for vm in self._vms if vm.name != config.name]
+ self._vms.append(config)
+ self.save()
+
+ def remove(self, name: str) -> bool:
+ """Remove a VM from the registry.
+
+ Returns:
+ True if VM was found and removed.
+ """
+ original_len = len(self._vms)
+ self._vms = [vm for vm in self._vms if vm.name != name]
+ if len(self._vms) < original_len:
+ self.save()
+ return True
+ return False
+
+ def get(self, name: str) -> VMConfig | None:
+ """Get a VM by name."""
+ for vm in self._vms:
+ if vm.name == name:
+ return vm
+ return None
+
+ def list(self) -> list[VMConfig]:
+ """List all VMs."""
+ return list(self._vms)
+
+ def check_all(self, timeout: int = 5) -> list[VMStatus]:
+ """Check status of all VMs.
+
+ Args:
+ timeout: Timeout per VM check.
+
+ Returns:
+ List of VMStatus for each registered VM.
+ """
+ statuses = []
+ for config in self._vms:
+ monitor = VMMonitor(config, timeout=timeout)
+ statuses.append(monitor.check_status())
+ return statuses
+
+
+def main():
+ """CLI entry point for VM monitoring."""
+ import argparse
+
+ parser = argparse.ArgumentParser(description="Monitor WAA VMs")
+ parser.add_argument("--host", help="SSH host")
+ parser.add_argument("--user", default="azureuser", help="SSH user")
+ parser.add_argument("--container", default="winarena", help="Docker container name")
+ parser.add_argument(
+ "--interval", type=int, default=30, help="Check interval in seconds"
+ )
+ parser.add_argument("--output", help="Output file for status updates (JSON lines)")
+ parser.add_argument("--list", action="store_true", help="List all registered VMs")
+ parser.add_argument(
+ "--check-all", action="store_true", help="Check all registered VMs"
+ )
+
+ args = parser.parse_args()
+
+ if args.list:
+ registry = VMRegistry()
+ for vm in registry.list():
+ print(
+ f" {vm.name}: {vm.ssh_user}@{vm.ssh_host} (container: {vm.docker_container})"
+ )
+ return
+
+ if args.check_all:
+ registry = VMRegistry()
+ for status in registry.check_all():
+ print(f"\n{status.config.name}:")
+ print(f" SSH: {'✓' if status.ssh_reachable else '✗'}")
+ print(f" VNC: {'✓' if status.vnc_reachable else '✗'}")
+ print(f" WAA: {'✓ READY' if status.waa_ready else '✗ Not ready'}")
+ if status.disk_usage_gb:
+ print(f" Disk: {status.disk_usage_gb} GB")
+ return
+
+ if not args.host:
+ parser.error("--host is required for monitoring")
+
+ config = VMConfig(
+ name="cli-vm",
+ ssh_host=args.host,
+ ssh_user=args.user,
+ docker_container=args.container,
+ )
+
+ monitor = VMMonitor(config)
+
+ def print_status(status: VMStatus):
+ ts = datetime.now().strftime("%H:%M:%S")
+ waa_str = "READY!" if status.waa_ready else "not ready"
+ disk_str = f"{status.disk_usage_gb}GB" if status.disk_usage_gb else "?"
+ print(
+ f"[{ts}] SSH: {'✓' if status.ssh_reachable else '✗'} | "
+ f"VNC: {'✓' if status.vnc_reachable else '✗'} | "
+ f"WAA: {waa_str} | Disk: {disk_str}"
+ )
+ if status.container_logs:
+ # Show last log line
+ last_line = status.container_logs.split("\n")[-1][:80]
+ print(f" Log: {last_line}")
+
+ print(f"Monitoring {args.host}... (Ctrl+C to stop)")
+ try:
+ final_status = monitor.run_monitor(
+ callback=print_status,
+ interval=args.interval,
+ output_file=args.output,
+ )
+ print(f"\n✓ WAA is ready! Probe response: {final_status.waa_probe_response}")
+ except KeyboardInterrupt:
+ print("\nMonitoring stopped.")
+
+
+# ============================================================================
+# Azure ML Job Tracking
+# ============================================================================
+
+
+@dataclass
+class AzureMLJob:
+ """Represents an Azure ML job."""
+
+ job_id: str
+ display_name: str
+ status: str # running, completed, failed, canceled
+ created_at: str
+ compute_target: str | None = None
+ duration_minutes: float | None = None
+ cost_usd: float | None = None
+ azure_dashboard_url: str | None = None
+
+
+def fetch_azure_ml_jobs(
+ resource_group: str = "openadapt-agents",
+ workspace_name: str = "openadapt-ml",
+ days: int = 7,
+ max_results: int = 20,
+) -> list[AzureMLJob]:
+ """Fetch recent Azure ML jobs.
+
+ Args:
+ resource_group: Azure resource group name.
+ workspace_name: Azure ML workspace name.
+ days: Number of days to look back.
+ max_results: Maximum number of jobs to return.
+
+ Returns:
+ List of AzureMLJob objects, sorted by creation time (newest first).
+ """
+ try:
+ result = subprocess.run(
+ [
+ "az",
+ "ml",
+ "job",
+ "list",
+ "--resource-group",
+ resource_group,
+ "--workspace-name",
+ workspace_name,
+ "--query",
+ "[].{name:name,display_name:display_name,status:status,created_at:creation_context.created_at,compute:compute}",
+ "-o",
+ "json",
+ ],
+ capture_output=True,
+ text=True,
+ timeout=30,
+ )
+
+ if result.returncode != 0:
+ logger.error(f"Azure CLI error: {result.stderr}")
+ return []
+
+ jobs_raw = json.loads(result.stdout)
+
+ # Filter by date
+ cutoff_date = datetime.now() - timedelta(days=days)
+ jobs = []
+
+ for job in jobs_raw[:max_results]:
+ created_at = job.get("created_at", "")
+ try:
+ # Parse ISO format: 2026-01-17T10:30:00Z
+ job_date = datetime.fromisoformat(
+ created_at.replace("Z", "+00:00")
+ if created_at
+ else datetime.now().isoformat()
+ )
+ if job_date < cutoff_date.replace(tzinfo=job_date.tzinfo):
+ continue
+ except (ValueError, AttributeError):
+ # If date parsing fails, include the job
+ pass
+
+            # Duration is not returned by "az ml job list"; left as None for now
+ duration_minutes = None
+ status = job.get("status", "unknown").lower()
+
+ # Build Azure dashboard URL
+ subscription_id = get_azure_subscription_id()
+ wsid = f"/subscriptions/{subscription_id}/resourceGroups/{resource_group}/providers/Microsoft.MachineLearningServices/workspaces/{workspace_name}"
+ dashboard_url = (
+ f"https://ml.azure.com/runs/{job.get('name', '')}?wsid={wsid}"
+ )
+
+ jobs.append(
+ AzureMLJob(
+ job_id=job.get("name", "unknown"),
+ display_name=job.get("display_name", ""),
+ status=status,
+ created_at=created_at,
+ compute_target=job.get("compute", None),
+ duration_minutes=duration_minutes,
+ azure_dashboard_url=dashboard_url,
+ )
+ )
+
+ return jobs
+
+ except Exception as e:
+ logger.error(f"Error fetching Azure ML jobs: {e}")
+ return []
+
+
+def get_azure_subscription_id() -> str:
+ """Get the current Azure subscription ID."""
+ try:
+ result = subprocess.run(
+ ["az", "account", "show", "--query", "id", "-o", "tsv"],
+ capture_output=True,
+ text=True,
+ timeout=10,
+ )
+ if result.returncode == 0:
+ return result.stdout.strip()
+ except Exception:
+ pass
+ return "unknown"
+
+
+# ============================================================================
+# Cost Tracking
+# ============================================================================
+
+
+@dataclass
+class VMCostEstimate:
+ """Estimated costs for VM usage."""
+
+ vm_size: str
+ hourly_rate_usd: float
+ hours_elapsed: float
+ cost_usd: float
+ cost_per_hour_usd: float
+ cost_per_day_usd: float
+ cost_per_week_usd: float
+
+
+# Azure VM pricing (US East, as of Jan 2025)
+VM_PRICING = {
+ "Standard_D2_v3": 0.096,
+ "Standard_D4_v3": 0.192,
+ "Standard_D8_v3": 0.384,
+ "Standard_D4s_v3": 0.192,
+ "Standard_D8s_v3": 0.384,
+ "Standard_D4ds_v5": 0.192,
+ "Standard_D8ds_v5": 0.384,
+ "Standard_D16ds_v5": 0.768,
+ "Standard_D32ds_v5": 1.536,
+}
+
+
+def calculate_vm_costs(
+ vm_size: str, hours: float, hourly_rate_override: float | None = None
+) -> VMCostEstimate:
+ """Calculate VM cost estimates.
+
+ Args:
+ vm_size: Azure VM size (e.g., "Standard_D4ds_v5").
+ hours: Number of hours the VM has been running.
+ hourly_rate_override: Override default hourly rate (for custom pricing).
+
+ Returns:
+ VMCostEstimate with cost breakdown.
+ """
+ hourly_rate = hourly_rate_override or VM_PRICING.get(vm_size, 0.20)
+ cost_usd = hourly_rate * hours
+
+ return VMCostEstimate(
+ vm_size=vm_size,
+ hourly_rate_usd=hourly_rate,
+ hours_elapsed=hours,
+ cost_usd=cost_usd,
+ cost_per_hour_usd=hourly_rate,
+ cost_per_day_usd=hourly_rate * 24,
+ cost_per_week_usd=hourly_rate * 24 * 7,
+ )
+
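+# Worked example (rates above are rough estimates, not billing data): a
+# Standard_D4ds_v5 worker at $0.192/hour running for 2.5 hours costs about
+# 0.192 * 2.5 = $0.48.
+#
+#     est = calculate_vm_costs("Standard_D4ds_v5", hours=2.5)
+#     print(f"${est.cost_usd:.2f}")  # -> $0.48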
+
+def get_vm_uptime_hours(
+ resource_group: str, vm_name: str, check_actual_state: bool = True
+) -> float:
+ """Get VM uptime in hours.
+
+ Args:
+ resource_group: Azure resource group.
+ vm_name: VM name.
+ check_actual_state: If True, check if VM is actually running.
+
+ Returns:
+ Hours since VM started, or 0 if VM is not running.
+ """
+ try:
+ # Get VM creation time or last start time
+ result = subprocess.run(
+ [
+ "az",
+ "vm",
+ "show",
+ "-d",
+ "-g",
+ resource_group,
+ "-n",
+ vm_name,
+ "--query",
+ "{powerState:powerState}",
+ "-o",
+ "json",
+ ],
+ capture_output=True,
+ text=True,
+ timeout=10,
+ )
+
+ if result.returncode != 0:
+ return 0.0
+
+ info = json.loads(result.stdout)
+ power_state = info.get("powerState", "")
+
+ # Check if VM is running
+ if check_actual_state and "running" not in power_state.lower():
+ return 0.0
+
+ # Try to get activity logs for last start time
+ result = subprocess.run(
+ [
+ "az",
+ "monitor",
+ "activity-log",
+ "list",
+ "--resource-group",
+ resource_group,
+ "--resource-id",
+ f"/subscriptions/{get_azure_subscription_id()}/resourceGroups/{resource_group}/providers/Microsoft.Compute/virtualMachines/{vm_name}",
+ "--query",
+ "[?operationName.localizedValue=='Start Virtual Machine' || operationName.localizedValue=='Create or Update Virtual Machine'].eventTimestamp | [0]",
+ "-o",
+ "tsv",
+ ],
+ capture_output=True,
+ text=True,
+ timeout=15,
+ )
+
+ if result.returncode == 0 and result.stdout.strip():
+ start_time_str = result.stdout.strip()
+ start_time = datetime.fromisoformat(start_time_str.replace("Z", "+00:00"))
+ elapsed = datetime.now(start_time.tzinfo) - start_time
+ return elapsed.total_seconds() / 3600
+
+ # Fallback: assume started 1 hour ago if we can't determine
+ return 1.0
+
+ except Exception as e:
+ logger.debug(f"Error getting VM uptime: {e}")
+ return 0.0
+
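+# Example (illustrative sketch): combine uptime and pricing for a rough spend
+# figure. The resource group and VM name are placeholders.
+#
+#     hours = get_vm_uptime_hours("my-rg", "azure-waa-vm")
+#     if hours:
+#         print(calculate_vm_costs("Standard_D4ds_v5", hours).cost_usd)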
+
+# ============================================================================
+# VM Activity Detection
+# ============================================================================
+
+
+@dataclass
+class VMActivity:
+ """Current VM activity information."""
+
+ is_active: bool
+ activity_type: str # idle, benchmark_running, training, setup, unknown
+ description: str
+ benchmark_progress: dict | None = None # If benchmark is running
+ last_action_time: str | None = None
+
+
+def detect_vm_activity(
+ ip: str,
+ ssh_user: str = "azureuser",
+ docker_container: str = "winarena",
+ internal_ip: str = "localhost", # WAA server bound to localhost via Docker port forward
+) -> VMActivity:
+ """Detect what the VM is currently doing.
+
+ Args:
+ ip: VM IP address.
+ ssh_user: SSH username.
+ docker_container: Docker container name.
+ internal_ip: Internal IP for WAA server.
+
+ Returns:
+ VMActivity with current activity information.
+ """
+ try:
+ # Check if container is running
+ result = subprocess.run(
+ [
+ "ssh",
+ "-o",
+ "StrictHostKeyChecking=no",
+ "-o",
+ "ConnectTimeout=5",
+ f"{ssh_user}@{ip}",
+ f"docker ps -q -f name={docker_container}",
+ ],
+ capture_output=True,
+ text=True,
+ timeout=10,
+ )
+
+ if result.returncode != 0 or not result.stdout.strip():
+ return VMActivity(
+ is_active=False,
+ activity_type="idle",
+ description="Container not running",
+ )
+
+ # Check WAA probe for benchmark status
+ result = subprocess.run(
+ [
+ "ssh",
+ "-o",
+ "StrictHostKeyChecking=no",
+ "-o",
+ "ConnectTimeout=5",
+ f"{ssh_user}@{ip}",
+ f"curl -s --connect-timeout 3 http://{internal_ip}:5000/probe",
+ ],
+ capture_output=True,
+ text=True,
+ timeout=10,
+ )
+
+ if result.returncode == 0 and result.stdout.strip():
+ probe_response = result.stdout.strip()
+ try:
+ probe_data = json.loads(probe_response)
+ # WAA is ready and responsive - check if benchmark is actually running
+ # by looking for python processes (Navi agent or our client)
+ python_check = subprocess.run(
+ [
+ "ssh",
+ "-o",
+ "StrictHostKeyChecking=no",
+ "-o",
+ "ConnectTimeout=5",
+ f"{ssh_user}@{ip}",
+ f"docker exec {docker_container} pgrep -f 'python.*run' 2>/dev/null | head -1",
+ ],
+ capture_output=True,
+ text=True,
+ timeout=10,
+ )
+ is_running = bool(python_check.stdout.strip())
+
+ return VMActivity(
+ is_active=is_running,
+ activity_type="benchmark_running" if is_running else "idle",
+ description="WAA benchmark running"
+ if is_running
+ else "WAA ready - idle",
+ benchmark_progress=probe_data,
+ )
+ except json.JSONDecodeError:
+ # Got response but not JSON - maybe setup phase
+ return VMActivity(
+ is_active=True,
+ activity_type="setup",
+ description="WAA starting up",
+ )
+
+ # Container running but WAA not ready
+ return VMActivity(
+ is_active=True,
+ activity_type="setup",
+ description="Windows VM booting or WAA initializing",
+ )
+
+ except Exception as e:
+ logger.debug(f"Error detecting VM activity: {e}")
+ return VMActivity(
+ is_active=False,
+ activity_type="unknown",
+ description=f"Error checking activity: {str(e)[:100]}",
+ )
+
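+# Example (illustrative sketch): poll a worker and log what it is doing. The IP
+# is a placeholder; ssh_user and docker_container use the defaults above.
+#
+#     activity = detect_vm_activity("203.0.113.10")
+#     logger.info("%s: %s", activity.activity_type, activity.description)
+#     if activity.benchmark_progress:
+#         logger.info("probe: %s", activity.benchmark_progress)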
+
+# ============================================================================
+# Evaluation History
+# ============================================================================
+
+
+@dataclass
+class EvaluationRun:
+ """Historical evaluation run."""
+
+ run_id: str
+ started_at: str
+ completed_at: str | None
+ num_tasks: int
+ success_rate: float | None
+ agent_type: str
+ status: str # running, completed, failed
+
+
+def get_evaluation_history(
+ results_dir: Path | str = "benchmark_results", max_runs: int = 10
+) -> list[EvaluationRun]:
+ """Get history of evaluation runs from results directory.
+
+ Args:
+ results_dir: Path to benchmark results directory.
+ max_runs: Maximum number of runs to return.
+
+ Returns:
+ List of EvaluationRun objects, sorted by start time (newest first).
+ """
+ results_path = Path(results_dir)
+ if not results_path.exists():
+ return []
+
+ runs = []
+
+ # Look for run directories or result files
+ for item in sorted(results_path.iterdir(), reverse=True):
+ if item.is_dir():
+ # Check for summary.json or similar
+ summary_file = item / "summary.json"
+ if summary_file.exists():
+ try:
+ summary = json.loads(summary_file.read_text())
+ runs.append(
+ EvaluationRun(
+ run_id=item.name,
+ started_at=summary.get("started_at", "unknown"),
+ completed_at=summary.get("completed_at", None),
+ num_tasks=summary.get("num_tasks", 0),
+ success_rate=summary.get("success_rate", None),
+ agent_type=summary.get("agent_type", "unknown"),
+ status=summary.get("status", "completed"),
+ )
+ )
+ except (json.JSONDecodeError, KeyError):
+ continue
+
+ if len(runs) >= max_runs:
+ break
+
+ return runs
+
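+# Example (illustrative sketch): summarize recent runs from the default results
+# directory, if any summary.json files exist.
+#
+#     for run in get_evaluation_history(max_runs=5):
+#         print(f"{run.run_id}: {run.num_tasks} tasks, "
+#               f"success_rate={run.success_rate}, status={run.status}")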
+
+if __name__ == "__main__":
+ main()
diff --git a/openadapt_evals/waa_deploy/Dockerfile b/openadapt_evals/waa_deploy/Dockerfile
new file mode 100644
index 0000000..02d0817
--- /dev/null
+++ b/openadapt_evals/waa_deploy/Dockerfile
@@ -0,0 +1,217 @@
+# =============================================================================
+# WAA (Windows Agent Arena) Docker Image
+# =============================================================================
+#
+# This image combines:
+# 1. dockurr/windows:latest - Modern base that auto-downloads Windows 11
+# 2. windowsarena/winarena:latest - Official WAA benchmark client and scripts
+#
+# The official windowsarena/winarena uses an outdated dockurr/windows (v0.00)
+# that doesn't auto-download Windows. This image fixes that while keeping
+# full compatibility with the official WAA benchmark.
+#
+# Usage:
+# # Build the image
+# docker build -t waa-auto:latest .
+#
+# # Run benchmark (after Windows is set up)
+# docker run --rm --device=/dev/kvm --cap-add NET_ADMIN \
+# -p 8006:8006 -p 5000:5000 -p 7200:7200 \
+# -v /path/to/storage:/storage \
+# -e OPENAI_API_KEY="your-key" \
+# waa-auto:latest \
+# "/entry.sh --start-client true --model gpt-4o --num-tasks 5"
+#
+# =============================================================================
+
+FROM dockurr/windows:latest
+
+# -----------------------------------------------------------------------------
+# Copy official WAA components from windowsarena/winarena
+# -----------------------------------------------------------------------------
+
+# Copy benchmark client scripts
+COPY --from=windowsarena/winarena:latest /entry.sh /entry.sh
+COPY --from=windowsarena/winarena:latest /entry_setup.sh /entry_setup.sh
+COPY --from=windowsarena/winarena:latest /start_client.sh /start_client.sh
+
+# Copy the Python benchmark client code
+COPY --from=windowsarena/winarena:latest /client /client
+
+# Copy our WAA server startup script
+COPY start_waa_server.bat /oem/start_waa_server.bat
+
+# Copy model weights (GroundingDINO, OmniParser, etc.)
+COPY --from=windowsarena/winarena:latest /models /models
+
+# Copy Windows setup scripts (install.bat, setup.ps1, etc.)
+COPY --from=windowsarena/winarena:latest /oem /oem
+
+# Copy OEM files AFTER dockurr/samba starts (which wipes /tmp/smb)
+# Copy IMMEDIATELY (no delay) and SYNCHRONOUSLY (not backgrounded) to ensure
+# files are available before Windows boots and runs FirstLogonCommands
+RUN sed -i '/^return 0$/i cp -r /oem/* /tmp/smb/ 2>/dev/null || true' /run/samba.sh && \
+ echo "Inserted OEM copy before return in samba.sh"
+
+# DO NOT replace dockurr/windows's autounattend.xml - it handles OOBE properly
+# Instead, only PATCH it to add InstallFrom element (prevents "Select OS" dialog)
+# This preserves dockurr/windows's native OEM mechanism
+RUN for xml in /run/assets/win11x64.xml /run/assets/win11x64-enterprise-eval.xml; do \
+ if [ -f "$xml" ] && ! grep -q "InstallFrom" "$xml"; then \
+            sed -i 's|<InstallTo>|<InstallFrom>\n                    <MetaData wcm:action="add">\n                        <Key>/IMAGE/INDEX</Key>\n                        <Value>1</Value>\n                    </MetaData>\n                </InstallFrom>\n                <InstallTo>|' "$xml"; \
+ fi; \
+ done && echo "Added InstallFrom element for automatic image selection"
+
+# -----------------------------------------------------------------------------
+# Create start_vm.sh that uses our dockurr/windows entrypoint
+# -----------------------------------------------------------------------------
+
+RUN printf '#!/bin/bash\n/usr/bin/tini -s /run/entry.sh\n' > /start_vm.sh && chmod +x /start_vm.sh
+
+# -----------------------------------------------------------------------------
+# Patch IP addresses: official uses 20.20.20.21, dockurr/windows uses 172.30.0.2
+# -----------------------------------------------------------------------------
+
+# Patch entry scripts (must work - these files were just copied)
+RUN sed -i 's|20.20.20.21|172.30.0.2|g' /entry_setup.sh && \
+ sed -i 's|20.20.20.21|172.30.0.2|g' /entry.sh && \
+ sed -i 's|20.20.20.21|172.30.0.2|g' /start_client.sh && \
+ echo "Patched entry scripts"
+
+# Patch client Python files
+RUN find /client -name "*.py" -exec sed -i 's|20.20.20.21|172.30.0.2|g' {} \; && \
+ echo "Patched client Python files"
+
+# -----------------------------------------------------------------------------
+# Add API-backed agent support (Claude Sonnet 4.5 / GPT-5.1)
+# This allows using --agent api-claude or --agent api-openai instead of navi
+# -----------------------------------------------------------------------------
+
+# Copy api_agent.py to the client mm_agents directory
+COPY api_agent.py /client/mm_agents/api_agent.py
+
+# Note: API agent patching (api-claude, api-openai) skipped for now
+# The navi agent works out of the box - API agents can be added later
+
+# -----------------------------------------------------------------------------
+# Fix Windows setup for automation
+# -----------------------------------------------------------------------------
+
+# Set password for AutoLogon (Windows 11 requires password for login)
+RUN sed -i 's|<Value />|<Value>docker</Value>|g' /run/assets/win11x64.xml 2>/dev/null || true
+RUN sed -i 's|<Value></Value>|<Value>docker</Value>|g' /run/assets/win11x64.xml 2>/dev/null || true
+
+# Add firewall disable and other automation commands to FirstLogonCommands
+# CRITICAL: Also create a scheduled task so WAA server starts on EVERY boot, not just first logon
+RUN if grep -q "</FirstLogonCommands>" /run/assets/win11x64.xml; then \
+    LAST_ORDER=$(grep -oP "Order>\K[0-9]+" /run/assets/win11x64.xml | sort -n | tail -1) && \
+    N1=$((LAST_ORDER + 1)) && \
+    N2=$((LAST_ORDER + 2)) && \
+    N3=$((LAST_ORDER + 3)) && \
+    N4=$((LAST_ORDER + 4)) && \
+    N5=$((LAST_ORDER + 5)) && \
+    N6=$((LAST_ORDER + 6)) && \
+    sed -i "s|</FirstLogonCommands>|\
+    <SynchronousCommand wcm:action=\"add\">\n\
+        <Order>$N1</Order>\n\
+        <CommandLine>netsh advfirewall set allprofiles state off</CommandLine>\n\
+        <Description>Disable Windows Firewall</Description>\n\
+    </SynchronousCommand>\n\
+    <SynchronousCommand wcm:action=\"add\">\n\
+        <Order>$N2</Order>\n\
+        <CommandLine>powercfg /change standby-timeout-ac 0</CommandLine>\n\
+        <Description>Disable sleep</Description>\n\
+    </SynchronousCommand>\n\
+    <SynchronousCommand wcm:action=\"add\">\n\
+        <Order>$N3</Order>\n\
+        <CommandLine>powercfg /change monitor-timeout-ac 0</CommandLine>\n\
+        <Description>Disable monitor timeout</Description>\n\
+    </SynchronousCommand>\n\
+    <SynchronousCommand wcm:action=\"add\">\n\
+        <Order>$N4</Order>\n\
+        <CommandLine>reg add \"HKLM\\\\SOFTWARE\\\\Policies\\\\Microsoft\\\\Windows\\\\Personalization\" /v NoLockScreen /t REG_DWORD /d 1 /f</CommandLine>\n\
+        <Description>Disable lock screen</Description>\n\
+    </SynchronousCommand>\n\
+    <SynchronousCommand wcm:action=\"add\">\n\
+        <Order>$N5</Order>\n\
+        <CommandLine>cmd /c start /wait \\\\\\\\host.lan\\\\Data\\\\install.bat</CommandLine>\n\
+        <Description>Run WAA setup script to install Python, Chrome, etc.</Description>\n\
+    </SynchronousCommand>\n\
+    <SynchronousCommand wcm:action=\"add\">\n\
+        <Order>$N6</Order>\n\
+        <CommandLine>schtasks /create /tn \"WAAServer\" /tr \"\\\\\\\\host.lan\\\\Data\\\\start_waa_server.bat\" /sc onlogon /rl highest /f</CommandLine>\n\
+        <Description>Create scheduled task for WAA server auto-start on every boot</Description>\n\
+    </SynchronousCommand>\n\
+    <SynchronousCommand wcm:action=\"add\">\n\
+        <Order>$((N6 + 1))</Order>\n\
+        <CommandLine>reg add \"HKCU\\\\SOFTWARE\\\\Microsoft\\\\Windows\\\\CurrentVersion\\\\Run\" /v WAAServer /t REG_SZ /d \"cmd /c \\\\\\\\host.lan\\\\Data\\\\start_waa_server.bat\" /f</CommandLine>\n\
+        <Description>Add registry entry for WAA server auto-start (backup)</Description>\n\
+    </SynchronousCommand>\n\
+    <SynchronousCommand wcm:action=\"add\">\n\
+        <Order>$((N6 + 2))</Order>\n\
+        <CommandLine>\\\\\\\\host.lan\\\\Data\\\\start_waa_server.bat</CommandLine>\n\
+        <Description>Start WAA server immediately</Description>\n\
+    </SynchronousCommand>\n\
+    </FirstLogonCommands>|" /run/assets/win11x64.xml; \
+    fi
+
+# -----------------------------------------------------------------------------
+# Copy Python 3.9 and all packages from vanilla image
+# -----------------------------------------------------------------------------
+# IMPORTANT: Do NOT install Python from apt or pip install packages ourselves.
+# The vanilla image has Python 3.9.20 with transformers 4.46.2 which is compatible
+# with GroundingDINO. Installing our own Python (3.13) with latest transformers (5.0)
+# breaks the navi agent with: AttributeError: 'BertModel' has no attribute 'get_head_mask'
+
+# Copy Python 3.9 installation from vanilla (binaries, libraries, packages)
+COPY --from=windowsarena/winarena:latest /usr/local/bin/python* /usr/local/bin/
+COPY --from=windowsarena/winarena:latest /usr/local/bin/pip* /usr/local/bin/
+COPY --from=windowsarena/winarena:latest /usr/local/lib/python3.9 /usr/local/lib/python3.9
+COPY --from=windowsarena/winarena:latest /usr/local/lib/libpython3.9.so* /usr/local/lib/
+COPY --from=windowsarena/winarena:latest /usr/local/include/python3.9 /usr/local/include/python3.9
+
+# Ensure the shared library is found
+RUN ldconfig
+
+# Create symlinks for python/pip commands
+RUN ln -sf /usr/local/bin/python3.9 /usr/local/bin/python && \
+ ln -sf /usr/local/bin/python3.9 /usr/bin/python && \
+ ln -sf /usr/local/bin/python3.9 /usr/bin/python3 && \
+ ln -sf /usr/local/bin/pip3.9 /usr/local/bin/pip && \
+ ln -sf /usr/local/bin/pip3.9 /usr/bin/pip && \
+ ln -sf /usr/local/bin/pip3.9 /usr/bin/pip3
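+
+# Sanity check (optional, illustrative command; adjust the image tag to your build):
+#   docker run --rm --entrypoint python waa-auto:latest --version   # expect Python 3.9.x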
+
+# Install only system dependencies that Python packages need (not Python itself)
+RUN apt-get update && apt-get install -y --no-install-recommends \
+ tesseract-ocr \
+ libgl1 \
+ libglib2.0-0 \
+ libsm6 \
+ libxext6 \
+ libxrender-dev \
+ ffmpeg \
+ && rm -rf /var/lib/apt/lists/*
+
+# Note: Playwright browsers not copied - not needed for navi agent (uses GroundingDINO)
+# If needed later, install via: python -m playwright install chromium
+
+# -----------------------------------------------------------------------------
+# Environment configuration
+# -----------------------------------------------------------------------------
+
+ENV YRES="900"
+ENV XRES="1440"
+ENV RAM_SIZE="8G"
+ENV CPU_CORES="4"
+ENV DISK_SIZE="30G"
+ENV VERSION="11e"
+ENV ARGUMENTS="-qmp tcp:0.0.0.0:7200,server,nowait"
+
+# Expose ports
+EXPOSE 8006 5000 7200 3389
+
+# Default entrypoint - use dockurr/windows's native entry point
+# The OEM files are copied by samba.sh (patched above) when Samba starts
+# dockurr/windows handles: QEMU VM startup, Samba, VNC, Windows boot
+# Our patched autounattend.xml handles: FirstLogonCommands that run install.bat
+ENTRYPOINT ["/usr/bin/tini", "-s", "/run/entry.sh"]
diff --git a/openadapt_evals/waa_deploy/__init__.py b/openadapt_evals/waa_deploy/__init__.py
new file mode 100644
index 0000000..e6e6821
--- /dev/null
+++ b/openadapt_evals/waa_deploy/__init__.py
@@ -0,0 +1,10 @@
+"""WAA (Windows Agent Arena) deployment module.
+
+This module contains files that are deployed into the WAA Docker container:
+- api_agent.py: API-based agent (Claude/GPT-5.1) for WAA
+- Dockerfile: Custom waa-auto Docker image
+"""
+
+from openadapt_evals.waa_deploy.api_agent import ApiAgent
+
+__all__ = ["ApiAgent"]
diff --git a/openadapt_evals/waa_deploy/api_agent.py b/openadapt_evals/waa_deploy/api_agent.py
new file mode 100644
index 0000000..85fc38f
--- /dev/null
+++ b/openadapt_evals/waa_deploy/api_agent.py
@@ -0,0 +1,540 @@
+"""WAA-compatible API Agent that uses Claude Sonnet 4.5 or GPT-5.1 directly.
+
+This module provides a drop-in replacement for the Navi agent in Windows Agent Arena
+that uses hosted VLM APIs (Claude or GPT-5.1) instead of the buggy Navi agent.
+
+The agent receives observations from WAA and returns actions in WAA's expected format
+(code blocks for the pyautogui action space).
+
+Why this exists:
+ The default Navi agent in WAA has NoneType errors and other bugs.
+ This API agent provides a reliable alternative that uses Claude Sonnet 4.5
+ or GPT-5.1 directly, bypassing the problematic Navi implementation.
+
+Usage from CLI:
+ # Run with Claude Sonnet 4.5 (requires ANTHROPIC_API_KEY)
+ uv run python -m openadapt_evals.cli vm run-waa --agent api-claude --num-tasks 5
+
+ # Run with GPT-5.1 (requires OPENAI_API_KEY)
+ uv run python -m openadapt_evals.cli vm run-waa --agent api-openai --num-tasks 5
+
+How it works:
+ 1. The Dockerfile copies this file to /client/mm_agents/api_agent.py
+ 2. The Dockerfile patches run.py to recognize "api-claude" and "api-openai" agents
+ 3. When the agent is selected, it:
+ - Receives screenshots from WAA's DesktopEnv
+ - Sends them to Claude or GPT-5.1 via their respective APIs
+ - Parses the response into pyautogui code blocks
+ - Returns actions in WAA's expected format
+
+Example usage in WAA run.py (auto-patched by Dockerfile):
+ if cfg_args["agent_name"] == "api-claude":
+ from mm_agents.api_agent import ApiAgent
+ agent = ApiAgent(provider="anthropic")
+ elif cfg_args["agent_name"] == "api-openai":
+ from mm_agents.api_agent import ApiAgent
+ agent = ApiAgent(provider="openai")
+"""
+
+from __future__ import annotations
+
+import base64
+import logging
+import os
+import re
+from io import BytesIO
+from typing import Dict, List
+
+from PIL import Image
+
+logger = logging.getLogger("desktopenv.agent.api")
+
+
+# System prompt for GUI automation - adapted from APIBenchmarkAgent
+SYSTEM_PROMPT = """You are a GUI automation agent controlling a Windows desktop. Given a screenshot and task instruction, determine the next action to take.
+
+You must respond with a Python code block that calls the computer API (WAA's pyautogui-backed action space). Available functions:
+- computer.click(x, y) - Click at pixel coordinates
+- computer.double_click(x, y) - Double-click at pixel coordinates
+- computer.right_click(x, y) - Right-click at pixel coordinates
+- computer.type(text) - Type the given text
+- computer.hotkey(key1, key2, ...) - Press key combination (e.g., 'ctrl', 'c')
+- computer.press(key) - Press a single key (e.g., 'enter', 'tab', 'escape')
+- computer.scroll(direction) - Scroll up (-3) or down (3)
+- computer.drag(x1, y1, x2, y2) - Drag from (x1,y1) to (x2,y2)
+
+Coordinates are pixel values within the screen (1920x1200 by default).
+
+Format your response as:
+
+```memory
+# Your notes about the task state (optional)
+```
+
+```decision
+CONTINUE
+```
+
+```python
+computer.click(500, 300)
+```
+
+Important:
+- Use DONE in the decision block when the task is complete
+- Use FAIL if the task cannot be completed
+- Always output exactly one action per response
+- Click on UI elements by their visual center coordinates
+- For text input, first click to focus the field, then type
+
+Think step by step:
+1. What is the current state of the UI?
+2. What is the goal?
+3. What is the next logical action?
+"""
+
+
+def format_accessibility_tree(tree: dict, indent: int = 0, max_depth: int = 5) -> str:
+ """Format accessibility tree for prompt.
+
+ Args:
+ tree: Accessibility tree dict from WAA.
+ indent: Current indentation level.
+ max_depth: Maximum depth to traverse.
+
+ Returns:
+ Formatted string representation.
+ """
+ if indent >= max_depth:
+ return ""
+
+ lines = []
+ prefix = " " * indent
+
+ role = tree.get("role", tree.get("control_type", "unknown"))
+ name = tree.get("name", "")
+ node_id = tree.get("id", tree.get("node_id", ""))
+
+ # Get bounding box if available
+ bbox_str = ""
+ if "bounding_rectangle" in tree:
+ br = tree["bounding_rectangle"]
+ bbox_str = f" [{br.get('left', 0)},{br.get('top', 0)},{br.get('right', 0)},{br.get('bottom', 0)}]"
+
+ line = f"{prefix}[{node_id}] {role}"
+ if name:
+ line += f": {name[:50]}" # Truncate long names
+ if bbox_str:
+ line += bbox_str
+ lines.append(line)
+
+ for child in tree.get("children", []):
+ child_text = format_accessibility_tree(child, indent + 1, max_depth)
+ if child_text:
+ lines.append(child_text)
+
+ return "\n".join(lines)
+
+
+def prev_actions_to_string(prev_actions: List[str], n_prev: int = 3) -> str:
+ """Format previous actions for the prompt.
+
+ Args:
+ prev_actions: List of previous action strings.
+ n_prev: Number of previous actions to include.
+
+ Returns:
+ Formatted string of previous actions.
+ """
+ result = ""
+ n_prev = min(n_prev, len(prev_actions))
+ for i in range(1, n_prev + 1):
+ action = prev_actions[-i]
+ result += f"Action at T-{i}:\n{action}\n\n"
+ return result
+
+
+class ApiAgent:
+ """WAA-compatible agent that uses Claude or GPT-5.1 API directly.
+
+ This agent implements the same interface as NaviAgent but uses hosted
+ VLM APIs instead of the local Navi implementation (which has NoneType bugs).
+
+ Args:
+ provider: API provider - "anthropic" (Claude) or "openai" (GPT-5.1).
+ api_key: Optional API key. If not provided, uses environment variables.
+ model: Optional model name override.
+ temperature: Sampling temperature (0.0-1.0).
+ max_tokens: Maximum tokens for API response.
+ use_accessibility_tree: Whether to include a11y tree in prompts.
+ use_history: Whether to include action history in prompts.
+        demo: Optional demonstration trajectory to include at every step.
+            This is the key fix for the 100% first-action / 0% episode-success
+            failure mode: the demo must persist across ALL steps, not just step 1.
+ """
+
+ # Default models for each provider
+ DEFAULT_MODELS = {
+ "anthropic": "claude-sonnet-4-5-20250929",
+ "openai": "gpt-5.1",
+ }
+
+ def __init__(
+ self,
+ provider: str = "anthropic",
+ api_key: str | None = None,
+ model: str | None = None,
+ temperature: float = 0.5,
+ max_tokens: int = 1500,
+ use_accessibility_tree: bool = True,
+ use_history: bool = True,
+ demo: str | None = None,
+ ):
+ self.provider = provider
+ self.model = model or self.DEFAULT_MODELS.get(provider)
+ self.temperature = temperature
+ self.max_tokens = max_tokens
+ self.use_accessibility_tree = use_accessibility_tree
+ self.use_history = use_history
+ self.demo = demo # Demo persists across ALL steps
+
+ # WAA compatibility
+ self.action_space = "code_block"
+
+ # Get API key
+ if provider == "anthropic":
+ self.api_key = api_key or os.getenv("ANTHROPIC_API_KEY")
+ if not self.api_key:
+ raise RuntimeError(
+ "ANTHROPIC_API_KEY is required for provider='anthropic'. "
+ "Set it in environment or pass api_key parameter."
+ )
+ try:
+ from anthropic import Anthropic
+
+ self._client = Anthropic(api_key=self.api_key)
+ except ImportError:
+ raise RuntimeError(
+ "anthropic package required. Install with: pip install anthropic"
+ )
+
+ elif provider == "openai":
+ self.api_key = api_key or os.getenv("OPENAI_API_KEY")
+ if not self.api_key:
+ raise RuntimeError(
+ "OPENAI_API_KEY is required for provider='openai'. "
+ "Set it in environment or pass api_key parameter."
+ )
+ try:
+ from openai import OpenAI
+
+ self._client = OpenAI(api_key=self.api_key)
+ except ImportError:
+ raise RuntimeError(
+ "openai package required. Install with: pip install openai"
+ )
+ else:
+ raise ValueError(f"Unsupported provider: {provider}")
+
+ # State tracking
+ self.prev_actions: List[str] = [] # Raw action codes for WAA compatibility
+ self.history: List[str] = [] # Rich history with reasoning (like PC Agent-E)
+ self.history_cutoff = 10 # Max history entries to include
+ self.memory_block_text = "# empty memory block"
+ self.step_counter = 0
+
+ logger.info(
+ f"ApiAgent initialized with provider={provider}, model={self.model}"
+ )
+ if self.demo:
+ logger.info(
+ f"Demo trajectory provided ({len(self.demo)} chars) - will persist across all steps"
+ )
+
+ def predict(self, instruction: str, obs: Dict) -> tuple:
+ """Predict the next action based on observation.
+
+ This method implements the same interface as NaviAgent.predict().
+
+ Args:
+ instruction: The task instruction.
+ obs: Observation dict containing:
+ - screenshot: PNG bytes of current screen
+ - accessibility_tree: A11y tree dict (optional)
+ - window_title: Current window title
+ - window_names_str: List of open windows
+ - computer_clipboard: Current clipboard content
+
+ Returns:
+ Tuple of (response_text, actions_list, logs_dict, computer_update_args)
+ """
+ logs = {}
+ self.step_counter += 1
+
+ # Extract screenshot
+ screenshot_bytes = obs.get("screenshot")
+ if screenshot_bytes is None:
+ logger.error("No screenshot in observation")
+ return "", ["# No screenshot available"], logs, {}
+
+ # Convert screenshot to PIL Image
+ try:
+ image = Image.open(BytesIO(screenshot_bytes))
+ w, h = image.size
+ except Exception as e:
+ logger.error(f"Failed to load screenshot: {e}")
+ return "", ["# Failed to load screenshot"], logs, {}
+
+ logs["image_width"] = w
+ logs["image_height"] = h
+
+ # Build the prompt
+ content_parts = [f"TASK: {instruction}"]
+
+        # CRITICAL FIX: include the demo at EVERY step, not just step 1.
+        # This addresses the 100% first-action / 0% episode-success failure mode.
+ if self.demo:
+ content_parts.append(
+ f"DEMONSTRATION (follow this pattern):\n"
+ f"---\n{self.demo}\n---\n"
+ f"Use the demonstration above as a guide. You are currently at step {self.step_counter}."
+ )
+ logs["demo_included"] = True
+ logs["demo_length"] = len(self.demo)
+
+ # Add context
+ window_title = obs.get("window_title", "")
+ if window_title:
+ content_parts.append(f"Current window: {window_title}")
+ logs["window_title"] = window_title
+
+ window_names_str = obs.get("window_names_str", "")
+ if window_names_str:
+ content_parts.append(f"Open windows: {window_names_str}")
+ logs["window_names_str"] = window_names_str
+
+ clipboard = obs.get("computer_clipboard", "")
+ if clipboard:
+ content_parts.append(f"Clipboard: {clipboard[:100]}")
+ logs["computer_clipboard"] = clipboard
+
+ # Add accessibility tree if available and enabled
+ if self.use_accessibility_tree:
+ a11y_tree = obs.get("accessibility_tree")
+ if a11y_tree:
+ tree_str = format_accessibility_tree(a11y_tree)
+ # Truncate if too long
+ if len(tree_str) > 4000:
+ tree_str = tree_str[:4000] + "\n... (truncated)"
+ content_parts.append(f"UI Elements:\n{tree_str}")
+ logs["accessibility_tree_len"] = len(tree_str)
+
+ # Add action history if enabled (enhanced: includes reasoning, not just raw actions)
+ if self.use_history and self.history:
+ # Use rich history with reasoning (like PC Agent-E)
+ history_entries = self.history[-self.history_cutoff :]
+ history_str = "\n\n".join(
+ f"[Step {i + 1}] {entry}" for i, entry in enumerate(history_entries)
+ )
+ content_parts.append(f"History of previous steps:\n{history_str}")
+ logs["history_entries"] = len(history_entries)
+ elif self.use_history and self.prev_actions:
+ # Fallback to raw action history
+ history_str = prev_actions_to_string(self.prev_actions, n_prev=5)
+ content_parts.append(f"Previous actions:\n{history_str}")
+
+ # Add memory block
+ content_parts.append(f"Your memory:\n```memory\n{self.memory_block_text}\n```")
+
+ content_parts.append(f"\nScreen dimensions: {w}x{h} pixels")
+ content_parts.append("\nWhat is the next action?")
+
+ user_prompt = "\n\n".join(content_parts)
+ logs["user_question"] = user_prompt
+
+ # Call the API
+ try:
+ response_text = self._call_api(screenshot_bytes, user_prompt)
+ except Exception as e:
+ logger.error(f"API call failed: {e}")
+ return "", ["# API call failed"], logs, {}
+
+ logs["plan_result"] = response_text
+
+ # Extract memory block
+ memory_match = re.search(r"```memory\n(.*?)```", response_text, re.DOTALL)
+ if memory_match:
+ self.memory_block_text = memory_match.group(1).strip()
+
+ # Extract decision block
+ decision_match = re.search(r"```decision\n(.*?)```", response_text, re.DOTALL)
+ if decision_match:
+ decision = decision_match.group(1).strip().upper()
+ if "DONE" in decision:
+ self.prev_actions.append("DONE")
+ return "", ["DONE"], logs, {}
+ elif "FAIL" in decision:
+ self.prev_actions.append("FAIL")
+ return "", ["FAIL"], logs, {}
+ elif "WAIT" in decision:
+ self.prev_actions.append("WAIT")
+ return "", ["WAIT"], logs, {}
+
+ # Extract Python code block
+ code_match = re.search(r"```python\n(.*?)```", response_text, re.DOTALL)
+ if code_match:
+ code_text = code_match.group(1).strip()
+ actions = [code_text]
+ self.prev_actions.append(code_text)
+ # Store rich history with reasoning (memory + action)
+ self._add_to_history(
+ f"Thought: {self.memory_block_text}\nAction: {code_text}"
+ )
+ else:
+ # Try to extract action from response text
+ action = self._parse_action_from_text(response_text, w, h)
+ if action:
+ actions = [action]
+ self.prev_actions.append(action)
+ self._add_to_history(
+ f"Thought: {self.memory_block_text}\nAction: {action}"
+ )
+ else:
+ logger.warning("Could not extract action from response")
+ actions = ["# Could not parse action"]
+
+ # Build computer_update_args (for WAA compatibility)
+ computer_update_args = {
+ "rects": [],
+ "window_rect": [0, 0, w, h],
+ "screenshot": image,
+ "scale": (1.0, 1.0),
+ "clipboard_content": clipboard,
+ "swap_ctrl_alt": False,
+ }
+
+ return "", actions, logs, computer_update_args
+
+ def _call_api(self, screenshot_bytes: bytes, user_prompt: str) -> str:
+ """Call the VLM API with screenshot and prompt.
+
+ Args:
+ screenshot_bytes: PNG image bytes.
+ user_prompt: User prompt text.
+
+ Returns:
+ Response text from the API.
+ """
+ image_b64 = base64.b64encode(screenshot_bytes).decode("utf-8")
+
+ if self.provider == "anthropic":
+ content = [
+ {"type": "text", "text": user_prompt},
+ {
+ "type": "image",
+ "source": {
+ "type": "base64",
+ "media_type": "image/png",
+ "data": image_b64,
+ },
+ },
+ ]
+
+ resp = self._client.messages.create(
+ model=self.model,
+ max_tokens=self.max_tokens,
+ system=SYSTEM_PROMPT,
+ messages=[{"role": "user", "content": content}],
+ )
+
+ # Extract text from response
+ parts = getattr(resp, "content", [])
+ texts = [
+ getattr(p, "text", "")
+ for p in parts
+ if getattr(p, "type", "") == "text"
+ ]
+ return "\n".join([t for t in texts if t]).strip()
+
+ elif self.provider == "openai":
+ messages = [
+ {"role": "system", "content": SYSTEM_PROMPT},
+ {
+ "role": "user",
+ "content": [
+ {"type": "text", "text": user_prompt},
+ {
+ "type": "image_url",
+ "image_url": {"url": f"data:image/png;base64,{image_b64}"},
+ },
+ ],
+ },
+ ]
+
+ resp = self._client.chat.completions.create(
+ model=self.model,
+ messages=messages,
+ max_completion_tokens=self.max_tokens,
+ temperature=self.temperature,
+ )
+ return resp.choices[0].message.content or ""
+
+ raise ValueError(f"Unsupported provider: {self.provider}")
+
+ def _parse_action_from_text(self, text: str, width: int, height: int) -> str | None:
+ """Try to parse an action from free-form text response.
+
+ Args:
+ text: Response text to parse.
+ width: Screen width.
+ height: Screen height.
+
+ Returns:
+ Python code string or None if parsing failed.
+ """
+ # Try to find click coordinates
+ click_match = re.search(r"click.*?(\d+)\s*,\s*(\d+)", text, re.IGNORECASE)
+ if click_match:
+ x, y = int(click_match.group(1)), int(click_match.group(2))
+ return f"computer.click({x}, {y})"
+
+ # Try to find type text
+ type_match = re.search(r'type[:\s]+["\'](.+?)["\']', text, re.IGNORECASE)
+ if type_match:
+ # Escape embedded double quotes so the generated call remains valid Python
+ text_to_type = type_match.group(1).replace('"', '\\"')
+ return f'computer.type("{text_to_type}")'
+
+ # Try to find key press
+ key_match = re.search(r"press[:\s]+(\w+)", text, re.IGNORECASE)
+ if key_match:
+ key = key_match.group(1).lower()
+ return f'computer.press("{key}")'
+
+ # Try to find hotkey
+ hotkey_match = re.search(r"hotkey[:\s]+(\w+)\s*\+\s*(\w+)", text, re.IGNORECASE)
+ if hotkey_match:
+ key1, key2 = hotkey_match.group(1).lower(), hotkey_match.group(2).lower()
+ return f'computer.hotkey("{key1}", "{key2}")'
+
+ return None
+
+ def _add_to_history(self, entry: str) -> None:
+ """Add an entry to the rich history (reasoning + action)."""
+ self.history.append(entry)
+
+ def set_demo(self, demo: str) -> None:
+ """Set or update the demo trajectory.
+
+ This allows setting the demo after initialization, which is
+ useful for dynamic demo retrieval.
+ """
+ self.demo = demo
+ logger.info(f"Demo set ({len(demo)} chars) - will persist across all steps")
+
+ def reset(self) -> None:
+ """Reset agent state between tasks."""
+ self.prev_actions = []
+ self.history = [] # Clear rich history too
+ self.memory_block_text = "# empty memory block"
+ self.step_counter = 0
+ # Note: demo is NOT reset - it persists across resets if set
+ logger.info("ApiAgent reset")
diff --git a/openadapt_evals/waa_deploy/start_waa_server.bat b/openadapt_evals/waa_deploy/start_waa_server.bat
new file mode 100644
index 0000000..a4f9a94
--- /dev/null
+++ b/openadapt_evals/waa_deploy/start_waa_server.bat
@@ -0,0 +1,53 @@
+@echo off
+REM start_waa_server.bat - Start WAA Flask server on Windows boot
+REM This script ensures the WAA server starts automatically on every boot
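+REM Assumes python.exe is available on PATH inside the Windows VM.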
+
+echo [WAA Startup] Starting WAA server...
+
+REM Pause briefly before checking the server (ping to localhost acts as a ~4-second delay)
+ping -n 5 127.0.0.1 > nul
+
+REM Check if server is already running
+netstat -an | find ":5000" | find "LISTENING" > nul
+if %errorlevel% == 0 (
+ echo [WAA Startup] Server already running on port 5000
+ exit /b 0
+)
+
+REM Try multiple possible server locations
+REM Location 1: OEM server path (official WAA location)
+if exist "C:\oem\server\main.py" (
+ cd /d C:\oem\server
+ start /b python main.py
+ echo [WAA Startup] Started from C:\oem\server
+ exit /b 0
+)
+
+REM Location 2: Network share (Samba)
+REM Use pushd: cmd cannot cd to a UNC path, so pushd maps it to a temporary drive letter
+if exist "\\host.lan\Data\server\main.py" (
+ pushd \\host.lan\Data\server
+ start /b python main.py
+ echo [WAA Startup] Started from network share
+ exit /b 0
+)
+
+REM Location 3: Legacy path
+if exist "C:\waa\server\main.py" (
+ cd /d C:\waa\server
+ start /b python main.py
+ echo [WAA Startup] Started from C:\waa\server
+ exit /b 0
+)
+
+REM If none found, try running from network directly
+echo [WAA Startup] Trying network server path...
+pushd \\host.lan\Data\server 2>nul
+if %errorlevel% == 0 (
+ start /b python main.py
+ echo [WAA Startup] Started from network path
+ exit /b 0
+)
+
+echo [WAA Startup] ERROR: WAA server not found in any expected location
+echo Checked: C:\oem\server, \\host.lan\Data\server, C:\waa\server
+exit /b 1
diff --git a/pyproject.toml b/pyproject.toml
index cb0c09f..0aab7ec 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
[project]
name = "openadapt-evals"
-version = "0.1.0"
+version = "0.1.1"
description = "Evaluation infrastructure for GUI agent benchmarks"
readme = "README.md"
requires-python = ">=3.10"
@@ -35,6 +35,8 @@ dependencies = [
"pillow>=10.0.0",
"python-dotenv>=1.2.1",
"tenacity>=8.2.0",
+ "requests>=2.28.0",
+ "httpx>=0.25.0",
]
[project.optional-dependencies]
@@ -72,6 +74,8 @@ test = [
]
[project.scripts]
+oa = "openadapt_evals.cli.main:main"
+# Legacy entry point (kept for backward compatibility)
openadapt-evals = "openadapt_evals.benchmarks.cli:main"
[project.urls]
diff --git a/scripts/check_waa_evaluate.py b/scripts/check_waa_evaluate.py
new file mode 100755
index 0000000..2de494c
--- /dev/null
+++ b/scripts/check_waa_evaluate.py
@@ -0,0 +1,39 @@
+#!/usr/bin/env python3
+"""Check WAA /evaluate endpoint health."""
+
+from __future__ import annotations
+
+import argparse
+import sys
+
+
+def main() -> int:
+ parser = argparse.ArgumentParser(description="Check WAA /evaluate endpoint")
+ parser.add_argument("--server", required=True, help="WAA server URL (e.g., http://vm-ip:5000)")
+ args = parser.parse_args()
+
+ try:
+ import requests
+ except ImportError:
+ print("ERROR: requests is required")
+ return 1
+
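+ # The /evaluate/health route is assumed to be provided by the patched WAA server (see scripts/patch_waa_evaluate.py).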
+ url = args.server.rstrip("/") + "/evaluate/health"
+ try:
+ resp = requests.get(url, timeout=5.0)
+ except Exception as exc:
+ print(f"ERROR: request failed: {exc}")
+ return 1
+
+ if resp.status_code != 200:
+ print(f"ERROR: /evaluate not ready (HTTP {resp.status_code})")
+ print(resp.text)
+ return 1
+
+ print("/evaluate endpoint ready")
+ print(resp.text)
+ return 0
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/scripts/patch_waa_evaluate.py b/scripts/patch_waa_evaluate.py
new file mode 100755
index 0000000..b61c240
--- /dev/null
+++ b/scripts/patch_waa_evaluate.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python3
+"""Patch WAA server to add /evaluate endpoint.
+
+This keeps WAA behavior vanilla while enabling programmatic evaluation over HTTP.
+It copies evaluate_endpoint.py into the WAA server directory and registers the
+blueprint in WAA's Flask app.
+"""
+
+from __future__ import annotations
+
+import argparse
+import shutil
+from pathlib import Path
+
+
+def _default_waa_path() -> Path:
+ cwd = Path.cwd()
+ if (cwd / "vendor" / "WindowsAgentArena").exists():
+ return cwd / "vendor" / "WindowsAgentArena"
+ if (cwd / "WindowsAgentArena").exists():
+ return cwd / "WindowsAgentArena"
+ if (Path.home() / "WindowsAgentArena").exists():
+ return Path.home() / "WindowsAgentArena"
+ return cwd / "vendor" / "WindowsAgentArena"
+
+
+def _patch_main(main_path: Path) -> None:
+ marker = "# openadapt-evals: /evaluate endpoint"
+ content = main_path.read_text()
+ if marker in content:
+ return
+
+ patch_block = (
+ "\n\n"
+ f"{marker}\n"
+ "try:\n"
+ " from evaluate_endpoint import create_evaluate_blueprint\n"
+ " evaluate_bp = create_evaluate_blueprint()\n"
+ " app.register_blueprint(evaluate_bp)\n"
+ "except Exception as exc:\n"
+ " print(f\"WAA /evaluate endpoint disabled: {exc}\")\n"
+ )
+
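+ # Insert the registration just before the __main__ guard so it runs whenever main.py is executed.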
+ if "if __name__ == \"__main__\":" in content:
+ parts = content.split("if __name__ == \"__main__\":", 1)
+ content = parts[0] + patch_block + "\nif __name__ == \"__main__\":" + parts[1]
+ else:
+ content += patch_block
+
+ main_path.write_text(content)
+
+
+def main() -> int:
+ parser = argparse.ArgumentParser(description="Patch WAA server /evaluate endpoint")
+ parser.add_argument("--waa-path", type=str, default=None, help="Path to WindowsAgentArena repo")
+ args = parser.parse_args()
+
+ waa_path = Path(args.waa_path) if args.waa_path else _default_waa_path()
+ if not waa_path.exists():
+ raise SystemExit(f"WAA repo not found at: {waa_path}")
+
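+ # Assumes the vanilla WAA layout; this server source is what runs inside the Windows VM (e.g., from C:\oem\server).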
+ server_dir = waa_path / "src" / "win-arena-container" / "vm" / "setup" / "server"
+ main_path = server_dir / "main.py"
+ if not main_path.exists():
+ raise SystemExit(f"WAA server main.py not found at: {main_path}")
+
+ evaluate_src = Path(__file__).resolve().parents[1] / "openadapt_evals" / "server" / "evaluate_endpoint.py"
+ if not evaluate_src.exists():
+ raise SystemExit(f"evaluate_endpoint.py not found at: {evaluate_src}")
+
+ server_dir.mkdir(parents=True, exist_ok=True)
+ shutil.copy2(evaluate_src, server_dir / "evaluate_endpoint.py")
+ _patch_main(main_path)
+
+ print(f"Patched WAA server: {main_path}")
+ print("/evaluate endpoint enabled (restart WAA server if running).")
+ return 0
+
+
+if __name__ == "__main__":
+ raise SystemExit(main())
diff --git a/tests/test_cost_optimization.py b/tests/test_cost_optimization.py
index 055bc4d..97a468e 100644
--- a/tests/test_cost_optimization.py
+++ b/tests/test_cost_optimization.py
@@ -86,82 +86,36 @@ def test_classify_complex_tasks():
assert tier == "complex", f"Task {task.task_id} should be classified as complex, got {tier}"
-def test_estimate_cost_baseline():
- """Test cost estimation with no optimizations (baseline)."""
+def test_estimate_cost_basic():
+ """Test basic cost estimation."""
estimate = estimate_cost(
num_tasks=154,
num_workers=10,
avg_task_duration_minutes=1.0,
)
- # Should have baseline == optimized (no optimizations)
- assert estimate["baseline_cost_usd"] == estimate["optimized_cost_usd"]
- assert estimate["savings_percentage"] == 0
- assert estimate["spot_savings_usd"] == 0
- assert estimate["tiered_savings_usd"] == 0
+ # Should return basic cost info
+ assert estimate["num_tasks"] == 154
+ assert estimate["num_workers"] == 10
+ assert estimate["estimated_cost_usd"] > 0
+ assert estimate["cost_per_task_usd"] > 0
+ assert estimate["tasks_per_worker"] == 15.4
+ assert estimate["total_vm_hours"] > 0
-def test_estimate_cost_with_tiered_vms():
- """Test cost estimation with tiered VMs enabled."""
+def test_estimate_cost_single_worker():
+ """Test cost estimation with single worker."""
estimate = estimate_cost(
num_tasks=154,
- num_workers=10,
+ num_workers=1,
avg_task_duration_minutes=1.0,
- enable_tiered_vms=True,
- task_complexity_distribution={
- "simple": 0.3,
- "medium": 0.5,
- "complex": 0.2,
- },
)
- # Should have some savings from tiered VMs
- assert estimate["optimized_cost_usd"] < estimate["baseline_cost_usd"]
- assert estimate["savings_percentage"] > 0
- assert estimate["tiered_savings_usd"] > 0
-
-
-def test_estimate_cost_with_spot_instances():
- """Test cost estimation with spot instances enabled."""
- estimate = estimate_cost(
- num_tasks=154,
- num_workers=10,
- avg_task_duration_minutes=1.0,
- use_spot_instances=True,
- )
-
- # Should have significant savings from spot (60-80%)
- assert estimate["optimized_cost_usd"] < estimate["baseline_cost_usd"]
- assert estimate["savings_percentage"] > 50
- assert estimate["spot_savings_usd"] > 0
-
-
-def test_estimate_cost_with_all_optimizations():
- """Test cost estimation with all optimizations enabled."""
- estimate = estimate_cost(
- num_tasks=154,
- num_workers=10,
- avg_task_duration_minutes=1.0,
- enable_tiered_vms=True,
- use_spot_instances=True,
- use_acr=True,
- task_complexity_distribution={
- "simple": 0.3,
- "medium": 0.5,
- "complex": 0.2,
- },
- )
-
- # Should have maximum savings (goal: 50-67%)
- assert estimate["optimized_cost_usd"] < estimate["baseline_cost_usd"]
- assert estimate["savings_percentage"] >= 50
- assert estimate["savings_percentage"] <= 70
-
- # Should have ACR time savings
- assert estimate["acr_time_savings_minutes"] > 0
-
- # Cost per task should meet target ($2.50-4.00 for 154 tasks = $0.016-0.026 per task)
- assert 0.010 <= estimate["cost_per_task_usd"] <= 0.030
+ # A single worker takes longer but uses the same total-cost logic
+ assert estimate["num_tasks"] == 154
+ assert estimate["num_workers"] == 1
+ assert estimate["estimated_cost_usd"] > 0
+ assert estimate["tasks_per_worker"] == 154
def test_cost_tracker():
@@ -260,16 +214,18 @@ def test_calculate_potential_savings():
assert savings["cost_per_task"] > 0
-def test_target_cost_achieved():
- """Test that target cost range ($2.50-4.00) is achievable."""
- # Full optimization scenario
- estimate = estimate_cost(
+def test_target_cost_with_optimizations():
+ """Test that target cost range is achievable with optimizations.
+
+ Uses calculate_potential_savings which has full optimization support.
+ """
+ # Full optimization scenario using the monitoring function
+ savings = calculate_potential_savings(
num_tasks=154,
num_workers=10,
avg_task_duration_minutes=1.0,
enable_tiered_vms=True,
use_spot_instances=True,
- use_acr=True,
task_complexity_distribution={
"simple": 0.3,
"medium": 0.5,
@@ -277,10 +233,9 @@ def test_target_cost_achieved():
},
)
- # Should be within target range
- assert 2.50 <= estimate["optimized_cost_usd"] <= 4.00, (
- f"Target cost $2.50-4.00, got ${estimate['optimized_cost_usd']}"
- )
+ # Should have significant savings with optimizations
+ assert savings["optimized_cost"] < savings["baseline_cost"]
+ assert savings["savings_percentage"] > 50
if __name__ == "__main__":
@@ -302,17 +257,11 @@ def test_target_cost_achieved():
test_classify_complex_tasks()
print("✓ Complex task classification works")
- test_estimate_cost_baseline()
- print("✓ Baseline cost estimation works")
-
- test_estimate_cost_with_tiered_vms()
- print("✓ Tiered VM cost estimation works")
-
- test_estimate_cost_with_spot_instances()
- print("✓ Spot instance cost estimation works")
+ test_estimate_cost_basic()
+ print("✓ Basic cost estimation works")
- test_estimate_cost_with_all_optimizations()
- print("✓ Full optimization cost estimation works")
+ test_estimate_cost_single_worker()
+ print("✓ Single worker cost estimation works")
test_cost_tracker()
print("✓ Cost tracker works")
@@ -323,8 +272,8 @@ def test_target_cost_achieved():
test_calculate_potential_savings()
print("✓ Savings calculation works")
- test_target_cost_achieved()
- print("✓ Target cost range ($2.50-4.00) is achievable")
+ test_target_cost_with_optimizations()
+ print("✓ Target cost with optimizations works")
print("\n" + "="*50)
print("All tests passed! ✓")
diff --git a/tests/test_evaluate_endpoint.py b/tests/test_evaluate_endpoint.py
index 2e82993..5a8c80a 100644
--- a/tests/test_evaluate_endpoint.py
+++ b/tests/test_evaluate_endpoint.py
@@ -229,7 +229,7 @@ def test_evaluate_calls_endpoint(self):
assert result.task_id == "test_1"
def test_evaluate_fallback_on_404(self):
- """Test fallback evaluation when endpoint returns 404."""
+ """Test evaluation behavior when endpoint returns 404."""
from openadapt_evals.adapters import WAALiveAdapter, WAALiveConfig
from openadapt_evals.adapters.base import BenchmarkTask, BenchmarkAction
@@ -249,13 +249,13 @@ def test_evaluate_fallback_on_404(self):
result = adapter.evaluate(task)
- # Should use fallback evaluation
+ # Without an evaluator spec, evaluation returns an unavailable error
assert result.success is False
- assert result.score > 0 # Partial score for having actions
- assert "Fallback" in result.reason
+ assert result.score == 0.0
+ assert "unavailable" in result.reason.lower() or "evaluator" in result.reason.lower()
def test_evaluate_fallback_on_connection_error(self):
- """Test fallback evaluation on connection error."""
+ """Test evaluation behavior on connection error."""
import requests
from openadapt_evals.adapters import WAALiveAdapter, WAALiveConfig
from openadapt_evals.adapters.base import BenchmarkTask, BenchmarkAction
@@ -277,11 +277,10 @@ def test_evaluate_fallback_on_connection_error(self):
result = adapter.evaluate(task)
- # Should use fallback evaluation
+ # Without an evaluator spec, evaluation returns an unavailable error
assert result.success is False
- assert "Fallback" in result.reason
- # Should have partial score for actions
- assert result.score > 0
+ assert "unavailable" in result.reason.lower() or "evaluator" in result.reason.lower()
+ assert result.score == 0.0
class TestLoadTaskFromJson: