From a83ad7b84ddbaf7964f64230788c232d84da2e6d Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Wed, 28 Jan 2026 11:51:05 -0500 Subject: [PATCH 01/21] docs: add verified repo consolidation plan - Two-package architecture: openadapt-evals (foundation) + openadapt-ml (ML) - Verified audit findings: 10 dead files confirmed, 3 previously marked dead but used - CLI namespacing: oa evals , oa ml - Dependency direction: openadapt-ml depends on openadapt-evals (not circular) - Agents with ML deps (PolicyAgent, BaselineAgent) move to openadapt-ml - adapters/waa/ subdirectory pattern for benchmark organization Co-Authored-By: Claude Opus 4.5 --- docs/REPO_CONSOLIDATION_PLAN.md | 1068 +++++++++++++++++++++++++++++++ 1 file changed, 1068 insertions(+) create mode 100644 docs/REPO_CONSOLIDATION_PLAN.md diff --git a/docs/REPO_CONSOLIDATION_PLAN.md b/docs/REPO_CONSOLIDATION_PLAN.md new file mode 100644 index 0000000..5561fd9 --- /dev/null +++ b/docs/REPO_CONSOLIDATION_PLAN.md @@ -0,0 +1,1068 @@ +# Repository Consolidation Plan + +## Overview + +Clean up the existing **two-package architecture** by moving code to the right place: + +``` +openadapt-evals # Foundation: benchmarks + infrastructure (standalone) + └── MOVE HERE: VM management, waa_deploy/, session tracking + └── Zero ML dependencies + └── Supports multiple benchmarks (WAA, OSWorld, WebArena, etc.) + +openadapt-ml # Extension: ML training + └── KEEP: training/, vlm/, baselines/, grounding/ + └── ADD DEPENDENCY: openadapt-evals + └── DELETE: duplicate benchmark code +``` + +**What this consolidation does:** +1. Moves VM/benchmark infrastructure from openadapt-ml → openadapt-evals +2. Deletes ~1000 lines of duplicate code between repos +3. Establishes proper dependency: openadapt-ml depends on openadapt-evals +4. Cleans up ~1500 lines of dead code (server patches never deployed) + +**Why `openadapt-evals` not `openadapt-waa`?** +- Avoids repo proliferation (no need for openadapt-osworld, openadapt-webarena, etc.) +- Single package supports all benchmarks with shared infrastructure +- Discoverability via README, PyPI keywords, GitHub topics instead of package name + +--- + +## Part 0: Current State (as of Jan 2026) + +### openadapt-ml Current Structure + +``` +openadapt_ml/ +├── benchmarks/ # VM + Benchmark code (mostly from PR #14) +│ ├── cli.py # ⭐ PR #14: VM lifecycle CLI (~1300 lines) +│ │ # Commands: create, delete, status, build, +│ │ # start, stop, probe, run, deallocate, +│ │ # logs, exec, docker-exec, vnc, tasks, etc. 
+│ ├── waa_deploy/ # ⭐ PR #14: Docker deployment +│ │ ├── Dockerfile # Custom WAA image build +│ │ ├── api_agent.py # Agent running inside container +│ │ ├── install.bat # Windows setup script +│ │ └── start_waa_server.bat # Server startup script +│ ├── vm_monitor.py # VM status monitoring +│ ├── azure_ops_tracker.py # Azure operation logging +│ ├── session_tracker.py # Cost/time tracking +│ ├── disk_manager.py # Disk space management +│ ├── dashboard.py # Dashboard generation +│ ├── viewer.py # Benchmark results viewer +│ │ +│ ├── # --- Duplicates (also in openadapt-evals) --- +│ ├── agent.py # → DELETE (use openadapt-evals) +│ ├── base.py # → DELETE (use openadapt-evals) +│ ├── runner.py # → DELETE (use openadapt-evals) +│ ├── waa.py # → DELETE (use openadapt-evals) +│ ├── waa_live.py # → DELETE (use openadapt-evals) +│ ├── data_collection.py # → DELETE (use openadapt-evals) +│ ├── live_tracker.py # → DELETE (use openadapt-evals) +│ ├── azure.py # → DELETE (use openadapt-evals) +│ └── trace_export.py # → DELETE (use openadapt-evals) +│ +├── cloud/ # Cloud infrastructure +│ ├── local.py # Dashboard server (~3700 lines, 90% benchmark) +│ ├── ssh_tunnel.py # SSH tunnel management +│ ├── lambda_labs.py # Lambda Labs GPU training +│ └── azure_inference.py # Azure ML inference +│ +├── training/ # ML Training (KEEP in openadapt-ml) +│ ├── trainer.py # Core trainer +│ ├── trl_trainer.py # TRL-based trainer +│ ├── stub_provider.py # Mock training for testing +│ ├── benchmark_viewer.py # Training benchmark viewer +│ ├── azure_ops_viewer.py # Azure ops viewer +│ ├── shared_ui.py # Shared UI components +│ ├── viewer.py # Training viewer +│ └── viewer_components.py # Viewer components +│ +├── models/ # VLM Adapters (KEEP in openadapt-ml) +│ ├── api_adapter.py # API-based VLM +│ ├── base_adapter.py # Base adapter interface +│ ├── qwen_vl.py # Qwen adapter +│ ├── dummy_adapter.py # Testing +│ └── providers/ # Provider implementations +│ ├── anthropic.py +│ ├── openai.py +│ └── google.py +│ +├── baselines/ # Baseline adapters (KEEP in openadapt-ml) +│ ├── adapter.py +│ ├── cli.py +│ ├── config.py +│ ├── parser.py +│ └── prompts.py +│ +├── grounding/ # UI grounding (KEEP in openadapt-ml) +│ ├── base.py +│ └── detector.py +│ +├── ingest/ # Data ingestion (KEEP in openadapt-ml) +│ ├── capture.py # OpenAdapt capture ingestion +│ ├── loader.py +│ └── synthetic.py +│ +├── retrieval/ # Demo retrieval (KEEP in openadapt-ml) +│ ├── retriever.py +│ ├── demo_retriever.py +│ ├── embeddings.py +│ └── index.py +│ +├── experiments/ # Research experiments (KEEP in openadapt-ml) +│ ├── demo_prompt/ # Demo-conditioned prompting +│ ├── representation_shootout/ # Representation experiments +│ └── waa_demo/ # WAA demo experiments +│ +├── segmentation/ # Workflow segmentation (KEEP in openadapt-ml) +│ ├── cli.py +│ ├── pipeline.py +│ ├── annotator.py +│ └── ... +│ +├── runtime/ # Runtime policy (KEEP in openadapt-ml) +│ ├── policy.py +│ └── safety_gate.py +│ +├── schema/ # Data schemas +│ ├── episode.py # Episode schema +│ └── converters.py +│ +├── evals/ # Evaluation metrics (KEEP in openadapt-ml) +│ ├── grounding.py +│ ├── trajectory_matching.py +│ └── plot_eval_metrics.py +│ +├── export/ # Data export (KEEP in openadapt-ml) +│ ├── cli.py +│ └── parquet.py +│ +├── scripts/ # CLI scripts (KEEP in openadapt-ml) +│ ├── train.py +│ ├── compare.py +│ ├── capture_screenshots.py +│ └── ... 
+│ +└── config.py # Configuration +``` + +### openadapt-evals Current Structure + +``` +openadapt_evals/ +├── adapters/ # Benchmark adapters (KEEP in openadapt-evals) +│ ├── base.py # BenchmarkAdapter interface +│ ├── waa.py # WAAMockAdapter +│ └── waa_live.py # WAALiveAdapter +│ +├── agents/ # Benchmark agents (KEEP in openadapt-evals) +│ ├── base.py # BenchmarkAgent interface +│ ├── api_agent.py # Claude/GPT API agent (P0 demo fix) +│ ├── retrieval_agent.py # Demo retrieval agent +│ ├── scripted_agent.py # Scripted agent for testing +│ ├── baseline_agent.py # → MOVE to openadapt-ml (uses VLM) +│ └── policy_agent.py # → MOVE to openadapt-ml (uses trained model) +│ +├── benchmarks/ # Benchmark framework (KEEP in openadapt-evals) +│ ├── cli.py # Evaluation CLI +│ ├── runner.py # evaluate_agent_on_benchmark() +│ ├── data_collection.py # ExecutionTraceCollector +│ ├── live_tracker.py # LiveEvaluationTracker +│ ├── monitoring.py # Benchmark monitoring +│ ├── dashboard_server.py # Dashboard HTTP server +│ ├── viewer.py # Results viewer +│ ├── config.py # Configuration +│ ├── health_checker.py # Health checking +│ ├── auto_screenshot.py # Screenshot automation +│ ├── generate_synthetic_demos.py +│ ├── validate_demos.py +│ ├── validate_screenshots.py +│ ├── agent.py # → Duplicate +│ ├── base.py # → Duplicate +│ ├── waa.py # → Duplicate +│ ├── waa_live.py # → Duplicate +│ ├── azure.py # → Duplicate +│ └── live_api.py +│ +├── evaluation/ # Evaluation framework (KEEP) +│ ├── client.py # → REVIEW (may be dead code) +│ └── discovery.py # VM IP auto-discovery (KEEP) +│ +├── server/ # Server patches → DELETE (unused) +│ ├── evaluate_endpoint.py # → DELETE (never deployed) +│ └── waa_server_patch.py # → DELETE (never deployed) +│ +├── shared_ui/ # UI components (KEEP) +│ └── keyboard_shortcuts.py +│ +├── metrics/ # Metrics (KEEP) +│ └── __init__.py +│ +└── tests/ # Tests (KEEP) + ├── test_api_agent_p0_fix.py + ├── test_api_agent_parsing.py + ├── test_cost_optimization.py + ├── test_evaluate_endpoint.py + ├── test_mock_adapter.py + ├── test_retrieval_agent.py + ├── test_runner.py + └── test_synthetic_demos.py +``` + +### PR #14 Code Summary + +PR #14 (merged Jan 2026) added the VM management CLI to openadapt-ml: + +**Files Added/Modified:** +- `openadapt_ml/benchmarks/cli.py` - ~1300 lines of VM lifecycle commands +- `openadapt_ml/benchmarks/waa_deploy/Dockerfile` - Custom WAA Docker image +- `openadapt_ml/benchmarks/waa_deploy/api_agent.py` - Agent inside container +- `openadapt_ml/benchmarks/waa_deploy/install.bat` - Windows setup +- `openadapt_ml/benchmarks/waa_deploy/start_waa_server.bat` - Server startup + +**CLI Commands (from PR #14):** +``` +create - Create Azure VM with nested virtualization +delete - Delete VM and ALL associated resources +status - Show VM state and IP +build - Build WAA image from waa_deploy/Dockerfile +start - Start WAA container +stop - Stop container +probe - Check if WAA server is ready +run - Run benchmark tasks +deallocate - Stop VM (preserves disk, stops billing) +logs - Show WAA status and logs +exec - Run command in container +docker-exec - Run docker command on host +vnc - Open VNC viewer +tasks - List available tasks +download - Download results +analyze - Analyze results +``` + +**Destination in openadapt-evals:** +- `cli.py` → `openadapt_evals/cli/vm.py` (merge with existing evals CLI) +- `waa_deploy/` → `openadapt_evals/waa_deploy/` + +--- + +## Part 0.5: Code Audit Results (VERIFIED 2026-01-28) + +> **✅ VERIFIED**: These findings have been confirmed by 
comprehensive import analysis. + +### Audit Methodology + +Verified by: +1. Checking all imports across the codebase (`grep -r "from.*module\|import.*module"`) +2. Checking exports in `__init__.py` files and `__all__` +3. Checking CLI command references +4. Checking test file imports + +### Dead Code (VERIFIED - 10 files) + +| File | Status | Evidence | +|------|--------|----------| +| `benchmarks/agent.py` | ✅ DEAD (deprecated shim) | Deprecation warning, zero imports | +| `benchmarks/base.py` | ✅ DEAD (deprecated shim) | Deprecation warning, zero imports | +| `benchmarks/waa.py` | ✅ DEAD (deprecated shim) | Deprecation warning, zero imports | +| `benchmarks/waa_live.py` | ✅ DEAD (deprecated shim) | Deprecation warning, zero imports | +| `benchmarks/auto_screenshot.py` | ✅ DEAD | Zero imports, no CLI command | +| `benchmarks/dashboard_server.py` | ✅ DEAD | Zero imports, no CLI command | +| `benchmarks/generate_synthetic_demos.py` | ✅ DEAD | Zero imports, no CLI command | +| `benchmarks/live_api.py` | ✅ DEAD | Zero imports, no CLI command | +| `benchmarks/validate_demos.py` | ✅ DEAD | Zero imports, no CLI command | +| `benchmarks/validate_screenshots.py` | ✅ DEAD | Zero imports, no CLI command | + +**Total: ~1000 lines of dead code to remove** + +### Previously Marked Dead But Actually Used (3 files) + +| File | Status | Evidence | +|------|--------|----------| +| `agents/baseline_agent.py` | ✅ USED | Lazy-exported in `agents/__init__.py` | +| `server/waa_server_patch.py` | ✅ USED | Referenced in `scripts/patch_waa_evaluate.py` | +| `server/evaluate_endpoint.py` | ✅ USED | Exported and tested (100+ tests) | + +### Agents Analysis (VERIFIED) + +Agents directory split based on ML dependencies: + +| Agent | ML Deps | Key Imports | Recommendation | +|-------|---------|-------------|----------------| +| `BenchmarkAgent` (base.py) | ❌ None | `abc`, `re`, `dataclasses` | Keep in openadapt-evals | +| `ScriptedAgent`, `RandomAgent`, `SmartMockAgent` | ❌ None | `random` | Keep in openadapt-evals | +| `ApiAgent` | ❌ None | `anthropic`, `openai` (API clients only) | Keep in openadapt-evals | +| `RetrievalAugmentedAgent` | ⚠️ `openadapt_retrieval` | Embedding models | Keep w/ lazy load | +| `PolicyAgent` | ✅ `openadapt_ml.vlm` | torch, transformers | **MOVE to openadapt-ml** | +| `BaselineAgent` | ✅ `openadapt_ml.baselines` | torch, transformers | **MOVE to openadapt-ml** | + +**Key Insight**: `ApiAgent` does NOT need ML deps - it just wraps hosted API clients (Claude, GPT). + +### Duplicates Between Repos (7 file pairs) + +These files exist in both openadapt-ml and openadapt-evals: + +| openadapt_evals/ | openadapt_ml/benchmarks/ | Notes | +|------------------|--------------------------|-------| +| `adapters/base.py` | `base.py` | Core schemas | +| `adapters/waa.py` | `waa.py` | WAA adapter | +| `adapters/waa_live.py` | `waa_live.py` | Live adapter | +| `benchmarks/runner.py` | `runner.py` | Eval loop | +| `benchmarks/data_collection.py` | `data_collection.py` | Trace saving | +| `benchmarks/live_tracker.py` | `live_tracker.py` | Progress tracking | +| `benchmarks/azure.py` | `azure.py` | Azure orchestration | + +**Recommendation**: Pick one canonical location, delete the other, update imports. 
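+
+The findings above came from manual `grep` passes and `__init__.py` inspection. A small script can pre-screen future audits; the sketch below is illustrative (not part of the migration) and simply parses imports with `ast`, flagging package modules that nothing else imports. Hits are candidates only: lazy `__getattr__` exports, script references, and test-only usage (the source of the three false positives above) still need manual review.
+
+```python
+# audit_imports.py: flag modules in PACKAGE that no other file in the repo imports.
+# Assumes it is run from the repo root of either package; PACKAGE is adjustable.
+import ast
+from pathlib import Path
+
+PACKAGE = "openadapt_evals"  # or "openadapt_ml"
+
+
+def imported_names(repo_root: Path) -> set[str]:
+    """Collect every dotted module name imported anywhere in the repo."""
+    names: set[str] = set()
+    for path in repo_root.rglob("*.py"):
+        try:
+            tree = ast.parse(path.read_text(encoding="utf-8"))
+        except (SyntaxError, UnicodeDecodeError):
+            continue
+        for node in ast.walk(tree):
+            if isinstance(node, ast.Import):
+                names.update(alias.name for alias in node.names)
+            elif isinstance(node, ast.ImportFrom) and node.module:
+                names.add(node.module)
+    return names
+
+
+def main() -> None:
+    repo_root = Path(".")
+    imports = imported_names(repo_root)
+    for path in sorted((repo_root / PACKAGE).rglob("*.py")):
+        if path.name == "__init__.py":
+            continue
+        dotted = ".".join(path.with_suffix("").parts)
+        # A module is a dead-code candidate if no import statement mentions it.
+        if not any(name == dotted or name.endswith("." + path.stem) for name in imports):
+            print(f"possibly dead: {dotted}")
+
+
+if __name__ == "__main__":
+    main()
+```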
+ +### Genuine Value-Add (TENTATIVE - 10 files) + +These files provide functionality not available elsewhere: + +| File | Value | Confidence | +|------|-------|------------| +| `agents/api_agent.py` | **P0 demo persistence fix** - critical | High | +| `agents/retrieval_agent.py` | Demo retrieval feature | High | +| `agents/scripted_agent.py` | Testing utilities (RandomAgent, SmartMockAgent) | High | +| `evaluation/discovery.py` | VM IP auto-discovery from multiple sources | High | +| `benchmarks/cli.py` | Evaluation-focused CLI | High | +| `benchmarks/config.py` | Task loading utilities | High | +| `benchmarks/runner.py` | Core evaluation loop | High | +| `benchmarks/viewer.py` | Results viewer | High | +| `benchmarks/health_checker.py` | Used by azure.py | Medium | +| `benchmarks/monitoring.py` | Cost tracking (used by tests) | Medium | + +### Revised Migration Recommendation + +Based on this audit, the approach is **simpler than originally planned**: + +**openadapt-evals already exists** - we're consolidating INTO it, not creating a new repo. + +**Move FROM openadapt-ml TO openadapt-evals:** +- `benchmarks/cli.py` (VM commands) → merge into `openadapt_evals/cli/` +- `benchmarks/waa_deploy/` → `openadapt_evals/waa_deploy/` +- `benchmarks/vm_monitor.py` → `openadapt_evals/infrastructure/` +- `benchmarks/session_tracker.py` → `openadapt_evals/infrastructure/` +- `cloud/ssh_tunnel.py` → `openadapt_evals/infrastructure/` + +**Delete FROM openadapt-evals (VERIFIED):** +- Deprecated shims (4): `benchmarks/agent.py`, `benchmarks/base.py`, `benchmarks/waa.py`, `benchmarks/waa_live.py` +- Dead code (6): `auto_screenshot.py`, `dashboard_server.py`, `generate_synthetic_demos.py`, `live_api.py`, `validate_demos.py`, `validate_screenshots.py` + +**KEEP in openadapt-evals (previously marked for deletion but actually used):** +- `server/waa_server_patch.py` - used by `scripts/patch_waa_evaluate.py` +- `server/evaluate_endpoint.py` - exported and tested +- `agents/baseline_agent.py` - lazy-exported in public API + +**Delete FROM openadapt-ml (duplicates):** +- `benchmarks/agent.py`, `base.py`, `runner.py`, `waa.py`, `waa_live.py` +- `benchmarks/data_collection.py`, `live_tracker.py`, `azure.py` + +**Move FROM openadapt-evals TO openadapt-ml (FIXES circular dependency):** +- `agents/policy_agent.py` - currently imports `openadapt_ml.vlm` (circular!) +- `agents/baseline_agent.py` - currently imports `openadapt_ml.baselines` (circular!) +- Moving them to openadapt-ml fixes the dependency direction: + - Before: evals → ml (wrong, creates circular dep) + - After: ml has the agents, depends on evals (correct) +- Keep backward-compat lazy imports in openadapt-evals (optional, for API compat) + +**Keep in openadapt-evals (no ML deps):** +- `agents/base.py` - abstract interface +- `agents/api_agent.py` - just API clients (anthropic, openai) +- `agents/scripted_agent.py` - test agents +- `agents/retrieval_agent.py` - keep with lazy load for openadapt_retrieval + +--- + +## Part 1: Architecture + +### Package Layering + +``` +┌─────────────────────────────────────────────────────────────┐ +│ openadapt-ml │ +│ ┌─────────────────────────────────────────────────────┐ │ +│ │ Training │ VLM Inference │ Policy Agent │ │ +│ │ Fine-tuning │ Qwen, etc. 
│ Trained models │ │ +│ └─────────────────────────────────────────────────────┘ │ +│ │ │ +│ depends on │ +│ ▼ │ +├─────────────────────────────────────────────────────────────┤ +│ openadapt-evals │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────┐ │ +│ │ Adapters │ │ Agents │ │ Evaluation │ │ +│ │ WAA, OS- │ │ API (GPT, │ │ Runner, metrics │ │ +│ │ World, etc │ │ Claude) │ │ Data collection │ │ +│ └─────────────┘ └─────────────┘ └─────────────────────┘ │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────┐ │ +│ │ Infra │ │ Dashboard │ │ CLI │ │ +│ │ VM, Docker │ │ Monitoring │ │ evals command │ │ +│ │ SSH, Azure │ │ Viewers │ │ │ │ +│ └─────────────┘ └─────────────┘ └─────────────────────┘ │ +│ ┌─────────────────────────────────────────────────────┐ │ +│ │ Schemas │ Config │ Utilities │ │ +│ └─────────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────┘ +``` + +### User Journeys + +**Journey 1: Benchmark Researcher (WAA, OSWorld, etc.)** +```bash +pip install openadapt-evals +oa evals vm setup +oa evals run --agent gpt-4o --tasks 10 +oa evals view --run my_eval +oa evals vm stop +``` +- No ML dependencies (no PyTorch, no transformers) +- Lightweight install +- Supports multiple benchmarks (WAA, OSWorld, WebArena, etc.) + +**Journey 2: ML Engineer (Training + Benchmarks)** +```bash +pip install openadapt-ml # Also installs openadapt-evals +oa ml train --capture /path/to/recording --goal "Open Notepad" +oa evals run --agent policy --checkpoint ./model +oa ml serve # Dashboard +``` +- Full ML training capabilities +- Uses `oa evals` for evaluation +- Trains custom agents with `oa ml`, evaluates on benchmarks + +**CLI Namespacing**: `oa evals ` for benchmarks, `oa ml ` for training. Clear ownership of commands. 
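+
+In code, the layering above shows up as a one-way import: openadapt-ml implements the agent interface that openadapt-evals defines, never the reverse. A minimal sketch of the post-migration `PolicyAgent` follows (interface names are taken from this plan; the `predict` call is a placeholder, not the real `AgentPolicy` API):
+
+```python
+# openadapt_ml/agents/policy_agent.py (sketch): ml imports evals, never the other way.
+from openadapt_evals import (
+    BenchmarkAction,
+    BenchmarkAgent,
+    BenchmarkObservation,
+    BenchmarkTask,
+)
+
+
+class PolicyAgent(BenchmarkAgent):
+    """Benchmark agent backed by a trained openadapt-ml policy."""
+
+    def __init__(self, policy) -> None:
+        self.policy = policy  # e.g. openadapt_ml.runtime.policy.AgentPolicy
+
+    def act(
+        self,
+        observation: BenchmarkObservation,
+        task: BenchmarkTask,
+        history: list[tuple[BenchmarkObservation, BenchmarkAction]] | None = None,
+    ) -> BenchmarkAction:
+        # The real implementation builds an SFT-style sample and converts the
+        # policy's Action back into a BenchmarkAction; both steps are elided here.
+        prediction = self.policy.predict(observation, task)  # placeholder call
+        return BenchmarkAction(type="done", raw_action={"prediction": prediction})
+```
+
+The benchmark researcher in Journey 1 never pays for this import; `PolicyAgent` lives entirely inside openadapt-ml.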
+ +--- + +## Part 2: Package Structures + +### openadapt-evals (Foundation) + +``` +openadapt-evals/ +├── openadapt_evals/ +│ │ +│ ├── ══════════════════════════════════════════ +│ ├── # BENCHMARK FRAMEWORK +│ ├── ══════════════════════════════════════════ +│ │ +│ ├── schemas/ # Shared data structures +│ │ ├── __init__.py +│ │ ├── actions.py # BenchmarkAction +│ │ ├── observations.py # BenchmarkObservation +│ │ ├── tasks.py # BenchmarkTask +│ │ └── results.py # BenchmarkResult +│ │ +│ ├── adapters/ # Benchmark environment adapters +│ │ ├── __init__.py +│ │ ├── base.py # BenchmarkAdapter interface +│ │ └── waa/ # Windows Agent Arena +│ │ ├── __init__.py +│ │ ├── mock.py # WAAMockAdapter +│ │ └── live.py # WAALiveAdapter +│ │ +│ ├── agents/ # Benchmark agents +│ │ ├── __init__.py +│ │ ├── base.py # BenchmarkAgent interface +│ │ ├── api_agent.py # Claude/GPT API agent (P0 demo fix) +│ │ ├── retrieval_agent.py # Demo retrieval agent +│ │ ├── scripted_agent.py # For testing +│ │ └── random_agent.py # Baseline +│ │ +│ ├── evaluation/ # Evaluation framework +│ │ ├── __init__.py +│ │ ├── runner.py # evaluate_agent_on_benchmark() +│ │ ├── metrics.py # compute_metrics() +│ │ ├── data_collection.py # ExecutionTraceCollector +│ │ └── live_tracker.py # LiveEvaluationTracker +│ │ +│ ├── ══════════════════════════════════════════ +│ ├── # INFRASTRUCTURE +│ ├── ══════════════════════════════════════════ +│ │ +│ ├── infrastructure/ # VM & cloud infrastructure +│ │ ├── __init__.py +│ │ ├── azure_vm.py # Azure VM lifecycle +│ │ ├── vm_monitor.py # VM status monitoring +│ │ ├── session_tracker.py # Cost/time tracking +│ │ ├── ssh_tunnel.py # SSH tunnel management +│ │ ├── disk_manager.py # Disk management +│ │ └── docker.py # Docker management +│ │ +│ ├── waa_deploy/ # WAA Docker deployment +│ │ ├── Dockerfile +│ │ ├── api_agent.py # Agent for inside container +│ │ └── install.bat +│ │ +│ ├── ══════════════════════════════════════════ +│ ├── # USER INTERFACE +│ ├── ══════════════════════════════════════════ +│ │ +│ ├── cli/ # CLI commands +│ │ ├── __init__.py +│ │ ├── main.py # Entry point: oa evals +│ │ ├── vm.py # oa evals vm +│ │ ├── run.py # oa evals run +│ │ ├── view.py # oa evals view +│ │ └── tasks.py # oa evals tasks +│ │ +│ ├── dashboard/ # Monitoring dashboard +│ │ ├── __init__.py +│ │ ├── server.py # HTTP server +│ │ ├── api.py # REST endpoints +│ │ └── viewers/ # HTML generation +│ │ ├── benchmark.py +│ │ └── azure_ops.py +│ │ +│ ├── ══════════════════════════════════════════ +│ ├── # UTILITIES +│ ├── ══════════════════════════════════════════ +│ │ +│ └── config.py # Configuration (API keys, Azure, etc.) 
+│ +├── tests/ +│ ├── test_adapters.py +│ ├── test_agents.py +│ ├── test_runner.py +│ ├── test_vm.py +│ └── test_cli.py +│ +├── docs/ +│ ├── getting_started.md +│ ├── cli_reference.md +│ └── vm_setup.md +│ +├── pyproject.toml +├── README.md # Benchmark-focused marketing +└── CLAUDE.md +``` + +**pyproject.toml (openadapt-evals):** +```toml +[project] +name = "openadapt-evals" +description = "GUI agent benchmark toolkit - WAA, OSWorld, WebArena evaluations" +dependencies = [ + "httpx>=0.25.0", + "pydantic>=2.0.0", + "pillow>=10.0.0", + "azure-cli>=2.50.0", + # NO torch, NO transformers, NO heavy ML deps +] + +[project.scripts] +oa = "openadapt_evals.cli.main:main" # Provides: oa evals +``` + +### openadapt-ml (Extension) + +``` +openadapt-ml/ +├── openadapt_ml/ +│ │ +│ ├── ══════════════════════════════════════════ +│ ├── # ML TRAINING +│ ├── ══════════════════════════════════════════ +│ │ +│ ├── training/ # Model training +│ │ ├── __init__.py +│ │ ├── trainer.py # Core trainer +│ │ ├── trl_trainer.py # TRL-based trainer +│ │ ├── stub_provider.py # Mock training for testing +│ │ └── dashboard.py # Training dashboard generation +│ │ +│ ├── vlm/ # VLM inference +│ │ ├── __init__.py +│ │ ├── qwen.py # Qwen adapter +│ │ ├── api_adapter.py # API-based VLM +│ │ └── base.py +│ │ +│ ├── baselines/ # Baseline model adapters +│ │ ├── __init__.py +│ │ ├── unified_adapter.py +│ │ └── providers/ +│ │ +│ ├── grounding/ # UI element grounding +│ │ ├── __init__.py +│ │ └── gemini_grounder.py +│ │ +│ ├── ══════════════════════════════════════════ +│ ├── # AGENTS & INTEGRATION +│ ├── ══════════════════════════════════════════ +│ │ +│ ├── agents/ # ML-specific agents +│ │ ├── __init__.py +│ │ ├── policy_agent.py # Uses trained VLM policy +│ │ └── baseline_agent.py # Unified baseline agent +│ │ +│ ├── ══════════════════════════════════════════ +│ ├── # CLI EXTENSION +│ ├── ══════════════════════════════════════════ +│ │ +│ ├── cli/ # Extended CLI +│ │ ├── __init__.py +│ │ ├── main.py # Entry point: oa ml +│ │ ├── train.py # oa ml train +│ │ └── serve.py # oa ml serve (training dashboard) +│ │ +│ ├── ══════════════════════════════════════════ +│ ├── # DATA & UTILITIES +│ ├── ══════════════════════════════════════════ +│ │ +│ ├── ingest/ # Data ingestion +│ │ ├── __init__.py +│ │ └── capture.py # OpenAdapt capture ingestion +│ │ +│ ├── cloud/ # Cloud GPU training +│ │ ├── __init__.py +│ │ ├── lambda_labs.py +│ │ └── azure_ml.py +│ │ +│ ├── experiments/ # Research experiments +│ │ ├── demo_prompt/ +│ │ └── waa_demo/ +│ │ +│ └── config.py # ML-specific config (extends evals config) +│ +├── tests/ +│ ├── test_training.py +│ ├── test_vlm.py +│ ├── test_policy_agent.py +│ └── test_cli.py +│ +├── docs/ +│ ├── training_guide.md +│ ├── model_development.md +│ └── cloud_training.md +│ +├── pyproject.toml +├── README.md +└── CLAUDE.md +``` + +**pyproject.toml (openadapt-ml):** +```toml +[project] +name = "openadapt-ml" +description = "ML training toolkit for OpenAdapt GUI automation agents" +dependencies = [ + "openadapt-evals>=0.1.0", # Foundation dependency + "torch>=2.0.0", + "transformers>=4.40.0", + "trl>=0.8.0", + "accelerate>=0.27.0", + # Heavy ML deps here +] + +# Note: oa entry point is registered by openadapt-evals +# openadapt-ml extends it by registering additional subcommands +# Implementation: oa ml routes to openadapt_ml.cli +``` + +--- + +## Part 3: CLI Design + +CLI uses namespaced subcommands: `oa evals ` for benchmarks, `oa ml ` for training. 
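+
+The pyproject note above leaves the extension mechanism open (`oa ml` routes to `openadapt_ml.cli`). One possible wiring, sketched below rather than decided here, is for the `oa` entry point in openadapt-evals to mount the `ml` namespace only when openadapt-ml is installed. The `openadapt_ml.cli.main` import path and its `main()` signature are assumptions based on the proposed structure.
+
+```python
+# Sketch of openadapt_evals/cli/main.py: one "oa" entry point, optional "ml" namespace.
+# A try/except import is shown; importlib.metadata entry-point discovery would also work.
+import argparse
+import sys
+
+
+def main(argv: list[str] | None = None) -> int:
+    parser = argparse.ArgumentParser(prog="oa")
+    subparsers = parser.add_subparsers(dest="namespace", required=True)
+
+    evals = subparsers.add_parser("evals", help="benchmark evaluation commands")
+    evals.add_argument("args", nargs=argparse.REMAINDER)
+
+    try:
+        # Hypothetical import path; assumes openadapt_ml/cli/main.py from this plan.
+        from openadapt_ml.cli.main import main as ml_main
+    except ImportError:
+        ml_main = None
+    if ml_main is not None:
+        ml = subparsers.add_parser("ml", help="training commands (requires openadapt-ml)")
+        ml.add_argument("args", nargs=argparse.REMAINDER)
+
+    parsed = parser.parse_args(argv)
+    if parsed.namespace == "ml" and ml_main is not None:
+        return ml_main(parsed.args)
+    # Dispatch of individual "oa evals ..." subcommands (vm, run, view, ...) goes here.
+    print(f"oa evals {' '.join(parsed.args)}")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
+```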
+ +### oa evals (openadapt-evals) + +```bash +# VM Management +oa evals vm create # Create Azure VM +oa evals vm delete # Delete VM +oa evals vm start / stop # Start/stop VM +oa evals vm deallocate # Deallocate (stop billing) +oa evals vm status # Show VM status +oa evals vm setup # Full setup (Docker + benchmark image) +oa evals vm probe # Check benchmark server status +oa evals vm diag # Diagnostic info +oa evals vm logs # Container logs +oa evals vm ssh # Interactive SSH +oa evals vm vnc # Open VNC viewer + +# Evaluation +oa evals mock # Mock evaluation (no VM) +oa evals live # Live evaluation against server +oa evals run # Shorthand for common evaluation + +# Results & Monitoring +oa evals view # Generate results viewer +oa evals dashboard # Start monitoring dashboard +oa evals tasks # List available tasks + +# Configuration +oa evals config # Show/edit configuration +oa evals config set KEY VALUE +``` + +### oa ml (openadapt-ml) + +```bash +# Training +oa ml train # Start training +oa ml train --capture /path --goal "description" +oa ml train --config config.yaml +oa ml train status # Training status +oa ml train stop # Stop training + +# ML Dashboard +oa ml serve # Serve training dashboard +oa ml serve --port 8080 + +# Cloud Training +oa ml cloud launch # Launch cloud GPU instance +oa ml cloud status # Check cloud training +oa ml cloud terminate # Terminate instance + +# ML-specific evaluation (uses oa evals under the hood) +oa evals run --agent policy --checkpoint ./model +``` + +--- + +## Part 4: Migration Steps + +### Phase 1: Consolidate into openadapt-evals (Existing Repo) + +Since openadapt-evals already exists, we consolidate INTO it rather than creating a new repo. + +1. **Restructure openadapt-evals** for multi-benchmark support: + - Move `adapters/waa*.py` → `adapters/waa/` (subdirectory per benchmark) + - Move `benchmarks/cli.py` → restructure into `cli/` + - Move `benchmarks/dashboard_server.py` → `dashboard/` + - Create `infrastructure/` directory for VM/cloud code +2. **Copy from openadapt-ml**: + - `benchmarks/cli.py` (VM commands) → `openadapt_evals/cli/vm.py` + - `benchmarks/waa_deploy/` → `openadapt_evals/waa_deploy/` + - `benchmarks/vm_monitor.py` → `openadapt_evals/infrastructure/vm_monitor.py` + - `benchmarks/session_tracker.py` → `openadapt_evals/infrastructure/session_tracker.py` + - `benchmarks/azure_ops_tracker.py` → `openadapt_evals/infrastructure/azure_ops_tracker.py` + - `cloud/ssh_tunnel.py` → `openadapt_evals/infrastructure/ssh_tunnel.py` +3. **Clean up dead code** (after verification): + - Delete deprecated shims: `benchmarks/agent.py`, `benchmarks/base.py`, etc. + - Delete unused server patch: `server/waa_server_patch.py`, `server/evaluate_endpoint.py` +4. **Write CLI entry point**: `evals` +5. **Write tests** +6. **Write README** with multi-benchmark marketing + +### Phase 2: Refactor openadapt-ml + +1. **Add dependency**: `openadapt-evals>=0.1.0` +2. **Delete moved code**: + - `benchmarks/` (most of it) + - `cloud/local.py` (dashboard moved to evals) + - `cloud/ssh_tunnel.py` (moved to evals) +3. **Keep ML-specific code**: + - `training/` + - `vlm/` + - `baselines/` + - `grounding/` + - `ingest/` + - `cloud/lambda_labs.py`, `cloud/azure_ml.py` + - `experiments/` +4. **Add ML-specific agents**: + - `agents/policy_agent.py` + - `agents/baseline_agent.py` +5. **Create extended CLI**: `oa` that imports from evals and adds training +6. **Update imports** to use `openadapt_evals` +7. **Update tests** + +### Phase 3: Update Documentation + +1. 
**Update openadapt-evals README**: Multi-benchmark focus + - "GUI agent benchmark toolkit - WAA, OSWorld, WebArena evaluations" +2. **Update openadapt-ml README**: Training focus + - Links to openadapt-evals for evaluation +3. **Update CLAUDE.md** in both repos + +### Phase 4: Publishing & Marketing + +1. **openadapt-evals README**: Multi-benchmark-focused + - "GUI agent benchmark toolkit - WAA, OSWorld, WebArena evaluations" + - One-liner install + - Quick start examples for each supported benchmark +2. **openadapt-ml README**: Training-focused + - "Train custom GUI automation agents" + - Links to openadapt-evals for evaluation +3. **PyPI publishing**: Publish both packages +4. **Update main OpenAdapt docs** to reference both + +--- + +## Part 5: File Mapping (Detailed) + +### openadapt-evals Internal Restructuring + +These files stay in openadapt-evals but may be reorganized: + +| Current Location | New Location | Notes | +|------------------|--------------|-------| +| **Adapters** | | | +| `adapters/base.py` | `adapters/base.py` | BenchmarkAdapter interface (keep) | +| `adapters/waa.py` | `adapters/waa/mock.py` | WAAMockAdapter | +| `adapters/waa_live.py` | `adapters/waa/live.py` | WAALiveAdapter | +| **Agents** | | | +| `agents/base.py` | `agents/base.py` | BenchmarkAgent interface (keep) | +| `agents/api_agent.py` | `agents/api_agent.py` | Claude/GPT agent (P0 demo fix) | +| `agents/retrieval_agent.py` | `agents/retrieval_agent.py` | Demo retrieval | +| `agents/scripted_agent.py` | `agents/scripted_agent.py` | For testing | +| `agents/baseline_agent.py` | → MOVE to openadapt-ml | Uses VLM (ML dep) | +| `agents/policy_agent.py` | → MOVE to openadapt-ml | Uses trained model (ML dep) | +| **Evaluation** | | | +| `benchmarks/runner.py` | `evaluation/runner.py` | Core evaluation | +| `benchmarks/data_collection.py` | `evaluation/data_collection.py` | Trace collector | +| `benchmarks/live_tracker.py` | `evaluation/live_tracker.py` | Live tracking | +| `benchmarks/monitoring.py` | `evaluation/monitoring.py` | Monitoring | +| `benchmarks/health_checker.py` | `evaluation/health_checker.py` | Health checks | +| `evaluation/client.py` | `evaluation/client.py` | Eval client | +| `evaluation/discovery.py` | `evaluation/discovery.py` | Service discovery | +| **CLI** | | | +| `benchmarks/cli.py` | `cli/eval.py` | Evaluation commands | +| **Dashboard** | | | +| `benchmarks/dashboard_server.py` | `dashboard/server.py` | HTTP server | +| `benchmarks/viewer.py` | `dashboard/viewer.py` | Results viewer | +| **Config** | | | +| `benchmarks/config.py` | `config.py` | Configuration | +| **Delete (dead code)** | | | +| `server/evaluate_endpoint.py` | DELETE | Never deployed | +| `server/waa_server_patch.py` | DELETE | Never deployed | +| `benchmarks/auto_screenshot.py` | DELETE | Never imported | +| `benchmarks/generate_synthetic_demos.py` | DELETE | Never imported | +| `benchmarks/validate_demos.py` | DELETE | Never imported | +| `benchmarks/validate_screenshots.py` | DELETE | Never imported | +| `benchmarks/live_api.py` | DELETE | Never imported | +| **Delete (duplicates)** | | | +| `benchmarks/agent.py` | DELETE | Duplicate shim | +| `benchmarks/base.py` | DELETE | Duplicate shim | +| `benchmarks/waa.py` | DELETE | Duplicate shim | +| `benchmarks/waa_live.py` | DELETE | Duplicate shim | +| `benchmarks/azure.py` | DELETE | Duplicate | +| **UI Components** | | | +| `shared_ui/keyboard_shortcuts.py` | `shared_ui/keyboard_shortcuts.py` | UI shortcuts | +| **Tests** | | | +| `tests/test_api_agent_*.py` | 
`tests/test_api_agent_*.py` | Agent tests | +| `tests/test_runner.py` | `tests/test_runner.py` | Runner tests | +| `tests/test_mock_adapter.py` | `tests/test_mock_adapter.py` | Adapter tests | +| `tests/test_retrieval_agent.py` | `tests/test_retrieval_agent.py` | Retrieval tests | + +### From openadapt-ml → openadapt-evals + +| Source (openadapt_ml/) | Destination (openadapt_evals/) | Notes | +|------------------------|--------------------------------|-------| +| **PR #14 Code** | | | +| `benchmarks/cli.py` | `cli/vm.py` | ⭐ VM lifecycle commands (1300 lines) | +| `benchmarks/waa_deploy/` | `waa_deploy/` | ⭐ Docker deployment files | +| `benchmarks/waa_deploy/Dockerfile` | `waa_deploy/Dockerfile` | WAA image build | +| `benchmarks/waa_deploy/api_agent.py` | `waa_deploy/api_agent.py` | In-container agent | +| `benchmarks/waa_deploy/install.bat` | `waa_deploy/install.bat` | Windows setup | +| `benchmarks/waa_deploy/start_waa_server.bat` | `waa_deploy/start_waa_server.bat` | Server startup | +| **Infrastructure** | | | +| `benchmarks/vm_monitor.py` | `infrastructure/vm_monitor.py` | VM status monitoring | +| `benchmarks/session_tracker.py` | `infrastructure/session_tracker.py` | Cost/time tracking | +| `benchmarks/azure_ops_tracker.py` | `infrastructure/azure_ops_tracker.py` | Azure op logging | +| `benchmarks/disk_manager.py` | `infrastructure/disk_manager.py` | Disk management | +| `benchmarks/dashboard.py` | `dashboard/panels.py` | Dashboard panels | +| `cloud/ssh_tunnel.py` | `infrastructure/ssh_tunnel.py` | SSH tunnels | +| **Dashboard Server** | | | +| `cloud/local.py` (partial) | `dashboard/server.py` | ~90% is benchmark (extract) | +| | | Training parts stay in openadapt-ml | +| **Viewers** | | | +| `benchmarks/viewer.py` | `dashboard/benchmark_viewer.py` | Benchmark viewer | +| `training/azure_ops_viewer.py` | `dashboard/azure_ops_viewer.py` | Azure ops viewer | +| **Skip (Duplicates - already in openadapt-evals)** | | | +| `benchmarks/agent.py` | Skip | Already in openadapt-evals | +| `benchmarks/base.py` | Skip | Already in openadapt-evals | +| `benchmarks/runner.py` | Skip | Already in openadapt-evals | +| `benchmarks/waa.py` | Skip | Already in openadapt-evals | +| `benchmarks/waa_live.py` | Skip | Already in openadapt-evals | +| `benchmarks/data_collection.py` | Skip | Already in openadapt-evals | +| `benchmarks/live_tracker.py` | Skip | Already in openadapt-evals | +| `benchmarks/azure.py` | Skip | Already in openadapt-evals | +| `benchmarks/trace_export.py` | Skip | Not needed | + +### Stays in openadapt-ml (After Migration) + +| Directory | Contents | Notes | +|-----------|----------|-------| +| `training/` | trainer.py, trl_trainer.py, stub_provider.py, etc. | Core ML training | +| `models/` | api_adapter.py, qwen_vl.py, providers/ | VLM inference | +| `baselines/` | adapter.py, cli.py, config.py, etc. | Baseline models | +| `grounding/` | base.py, detector.py | UI grounding | +| `ingest/` | capture.py, loader.py, synthetic.py | Data ingestion | +| `retrieval/` | retriever.py, demo_retriever.py, etc. | Demo retrieval | +| `experiments/` | demo_prompt/, waa_demo/, etc. | Research | +| `segmentation/` | cli.py, pipeline.py, etc. | Workflow segmentation | +| `runtime/` | policy.py, safety_gate.py | Runtime policy | +| `evals/` | grounding.py, trajectory_matching.py | Eval metrics | +| `export/` | cli.py, parquet.py | Data export | +| `scripts/` | train.py, compare.py, etc. 
| CLI scripts | +| `schema/` | episode.py, converters.py | OR move to openadapt-evals | +| `cloud/lambda_labs.py` | GPU training | Keep | +| `cloud/azure_inference.py` | Azure ML | Keep | +| `config.py` | Configuration | Extend openadapt_evals.config | + +### New Files in openadapt-ml (After Migration) + +| File | Purpose | +|------|---------| +| `agents/policy_agent.py` | Move from openadapt-evals (ML dep) | +| `agents/baseline_agent.py` | Move from openadapt-evals (ML dep) | +| `cli/main.py` | `oa` CLI entry point (extends `evals`) | +| `cli/train.py` | Training commands | +| `cli/serve.py` | Training dashboard server | + +### Delete from openadapt-ml (After Migration) + +| File | Reason | +|------|--------| +| `benchmarks/` (entire directory) | Moved to openadapt-evals | +| `cloud/local.py` | Dashboard parts moved to openadapt-evals | +| `cloud/ssh_tunnel.py` | Moved to openadapt-evals | +| `training/azure_ops_viewer.py` | Moved to openadapt-evals | +| `training/benchmark_viewer.py` | Moved to openadapt-evals | + +--- + +## Part 6: Effort Estimate + +| Phase | Tasks | Effort | +|-------|-------|--------| +| 1. Restructure openadapt-evals | Reorganize files, create cli/, infrastructure/ | 3-4 hrs | +| 2. Copy VM code from openadapt-ml | Move PR #14 code to evals | 2-3 hrs | +| 3. Write evals CLI | Entry point, subcommands | 2-3 hrs | +| 4. Clean up dead code | Delete unused files (after verification) | 1-2 hrs | +| 5. Refactor openadapt-ml | Delete moved code, add dependency | 2-3 hrs | +| 6. Write oa CLI extension | Extends evals, adds training | 1-2 hrs | +| 7. Update tests | Fix imports in both repos | 2-3 hrs | +| 8. Documentation | READMEs, CLAUDE.md, docs | 2-3 hrs | + +**Total: ~16-22 hours (2-3 days)** + +--- + +## Part 7: Success Criteria + +### openadapt-evals + +- [ ] `pip install openadapt-evals` works +- [ ] `oa evals --help` shows all commands +- [ ] `oa evals vm status` works (no ML deps imported) +- [ ] `oa evals mock --tasks 5` works +- [ ] `oa evals run --agent gpt-4o` works (with VM running) +- [ ] All tests pass +- [ ] No PyTorch/transformers in dependencies +- [ ] README has multi-benchmark quick start (WAA, OSWorld, WebArena) + +### openadapt-ml + +- [ ] `pip install openadapt-ml` installs openadapt-evals too +- [ ] `oa ml --help` shows training commands +- [ ] `oa ml train --help` works +- [ ] `oa evals run --agent policy` works with trained model +- [ ] All tests pass +- [ ] Imports from openadapt_evals work correctly +- [ ] Dependency direction: openadapt-ml → openadapt-evals (not circular) + +--- + +## Part 8: Marketing Positioning + +### openadapt-evals + +**Tagline**: "GUI agent benchmark toolkit - evaluate agents on WAA, OSWorld, WebArena" + +**README opener**: +```markdown +# openadapt-evals + +The easiest way to run GUI agent benchmarks. + +## Quick Start + +```bash +pip install openadapt-evals +oa evals vm setup # One-time Azure VM setup +oa evals run --agent gpt-4o --tasks 10 +oa evals view # See results +``` + +No ML dependencies. No complex setup. Just benchmarks. + +## Supported Benchmarks + +- **Windows Agent Arena (WAA)** - 154 Windows desktop tasks +- **OSWorld** - Cross-platform desktop (coming soon) +- **WebArena/VisualWebArena** - Browser tasks (coming soon) +``` + +**Target audience**: Researchers evaluating agents, teams benchmarking LLM capabilities + +### openadapt-ml + +**Tagline**: "Train custom GUI automation agents" + +**README opener**: +```markdown +# openadapt-ml + +Train and fine-tune VLMs for GUI automation. Built on openadapt-evals. 
+ +## Quick Start + +```bash +pip install openadapt-ml +oa ml train --capture ./recording --goal "Open Notepad and type Hello" +oa evals run --agent policy --checkpoint ./model +``` + +Full ML training pipeline with benchmark evaluation built in. +``` + +**Target audience**: ML engineers building GUI agents, researchers training custom models + +--- + +## Part 9: Future Considerations + +### Adding New Benchmarks + +To add a new benchmark (e.g., OSWorld, WebArena): + +1. Create adapter in `openadapt_evals/adapters/{benchmark}/` +2. Add CLI commands in `openadapt_evals/cli/{benchmark}.py` +3. Add VM/container setup if needed in `infrastructure/` +4. Update README with benchmark-specific quick start + +No new repos needed - openadapt-evals supports all benchmarks. + +### If We Split Again + +The two-package structure is already clean. If further splitting needed: + +- **openadapt-evals-azure**: Azure-specific infrastructure (for non-Azure users) +- **openadapt-evals-local**: Local-only running (Docker on local machine) + +### Integration with Main OpenAdapt + +``` +OpenAdapt (main) # Capture/recording + ↓ recordings +openadapt-ml # Training + ↓ trained models +openadapt-evals # Evaluation + ↓ benchmark results +``` + +The full pipeline: Capture → Train → Evaluate + +### openadapt-viewer Integration + +Both packages can use openadapt-viewer for HTML generation: +```toml +# Optional dependency +[project.optional-dependencies] +viewer = ["openadapt-viewer>=0.1.0"] +``` From 1ee037c5d0aa061be2aa1d7d2770aee73e48585d Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Wed, 28 Jan 2026 12:16:14 -0500 Subject: [PATCH 02/21] feat: add openadapt-evals as optional dependency Add [benchmarks] optional dependency for benchmark evaluation: - pip install openadapt-ml[benchmarks] This is part of the repo consolidation to establish: - openadapt-evals: Foundation for benchmarks + infrastructure - openadapt-ml: ML training (depends on evals for benchmarks) Co-Authored-By: Claude Opus 4.5 --- pyproject.toml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index a26039d..2f907fd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -68,6 +68,10 @@ dev = [ "pytest>=9.0.0", "ruff>=0.1.0", ] +# Benchmark evaluation (depends on openadapt-evals) +benchmarks = [ + "openadapt-evals>=0.1.0", +] [project.urls] Homepage = "https://github.com/OpenAdaptAI/openadapt-ml" From 0da88cb690f950f93ea4082a2f1b16b0028ccdfc Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Wed, 28 Jan 2026 12:21:37 -0500 Subject: [PATCH 03/21] docs(cli): clarify serve vs dashboard command naming - oa ml serve: serve trained models for inference - oa ml dashboard: training dashboard for monitoring This distinguishes the two use cases clearly: - serve = model inference endpoint - dashboard = training progress UI Co-Authored-By: Claude Opus 4.5 --- docs/REPO_CONSOLIDATION_PLAN.md | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/docs/REPO_CONSOLIDATION_PLAN.md b/docs/REPO_CONSOLIDATION_PLAN.md index 5561fd9..90af320 100644 --- a/docs/REPO_CONSOLIDATION_PLAN.md +++ b/docs/REPO_CONSOLIDATION_PLAN.md @@ -426,7 +426,8 @@ oa evals vm stop pip install openadapt-ml # Also installs openadapt-evals oa ml train --capture /path/to/recording --goal "Open Notepad" oa evals run --agent policy --checkpoint ./model -oa ml serve # Dashboard +oa ml serve --checkpoint ./model # Serve model for inference +oa ml dashboard # Training dashboard ``` - Full ML training capabilities - Uses `oa evals` for 
evaluation @@ -605,7 +606,8 @@ openadapt-ml/ │ │ ├── __init__.py │ │ ├── main.py # Entry point: oa ml │ │ ├── train.py # oa ml train -│ │ └── serve.py # oa ml serve (training dashboard) +│ │ ├── serve.py # oa ml serve (model inference server) +│ │ └── dashboard.py # oa ml dashboard (training dashboard) │ │ │ ├── ══════════════════════════════════════════ │ ├── # DATA & UTILITIES @@ -708,9 +710,14 @@ oa ml train --config config.yaml oa ml train status # Training status oa ml train stop # Stop training -# ML Dashboard -oa ml serve # Serve training dashboard -oa ml serve --port 8080 +# Model Serving (Inference) +oa ml serve # Serve trained model for inference +oa ml serve --checkpoint ./model # Serve specific checkpoint +oa ml serve --port 8080 # Custom port + +# Training Dashboard +oa ml dashboard # Start training dashboard +oa ml dashboard --port 8080 # Cloud Training oa ml cloud launch # Launch cloud GPU instance @@ -912,7 +919,8 @@ These files stay in openadapt-evals but may be reorganized: | `agents/baseline_agent.py` | Move from openadapt-evals (ML dep) | | `cli/main.py` | `oa` CLI entry point (extends `evals`) | | `cli/train.py` | Training commands | -| `cli/serve.py` | Training dashboard server | +| `cli/serve.py` | Model inference server | +| `cli/dashboard.py` | Training dashboard | ### Delete from openadapt-ml (After Migration) From f7c9fe489214149f7f0917bb2fe8aab2064ad7a5 Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Wed, 28 Jan 2026 12:58:16 -0500 Subject: [PATCH 04/21] refactor(benchmarks): consolidate to re-export from openadapt-evals Migrate benchmark infrastructure to two-package architecture: - openadapt-evals: Foundation package with all adapters, agents, runner - openadapt-ml: ML-specific agents that wrap openadapt-ml internals Changes: - Convert base.py, waa.py, waa_live.py, runner.py, data_collection.py, live_tracker.py to deprecation stubs that re-export from openadapt-evals - Keep only ML-specific agents in agent.py: PolicyAgent, APIBenchmarkAgent, UnifiedBaselineAgent - Update __init__.py to import from openadapt-evals with deprecation warning - Update tests to import from correct locations - Remove test_waa_live.py (tests belong in openadapt-evals) Net: -3540 lines of duplicate code removed Co-Authored-By: Claude Opus 4.5 --- openadapt_ml/benchmarks/__init__.py | 180 ++--- openadapt_ml/benchmarks/agent.py | 654 ++-------------- openadapt_ml/benchmarks/base.py | 398 +--------- openadapt_ml/benchmarks/data_collection.py | 456 +---------- openadapt_ml/benchmarks/live_tracker.py | 198 +---- openadapt_ml/benchmarks/runner.py | 450 +---------- openadapt_ml/benchmarks/waa.py | 845 +-------------------- openadapt_ml/benchmarks/waa_live.py | 635 +--------------- tests/benchmarks/test_waa.py | 14 +- tests/integration/test_data_collection.py | 10 +- tests/test_waa_live.py | 314 -------- 11 files changed, 300 insertions(+), 3854 deletions(-) delete mode 100644 tests/test_waa_live.py diff --git a/openadapt_ml/benchmarks/__init__.py b/openadapt_ml/benchmarks/__init__.py index f3e40e3..3c92000 100644 --- a/openadapt_ml/benchmarks/__init__.py +++ b/openadapt_ml/benchmarks/__init__.py @@ -1,98 +1,41 @@ """Benchmark integration for openadapt-ml. -DEPRECATION NOTICE: - The canonical benchmark code is now in the `openadapt-evals` package. 
- For new projects, prefer importing from `openadapt_evals`: +This module provides benchmark evaluation capabilities by re-exporting from +`openadapt-evals` (the canonical benchmark package) plus ML-specific agents +that wrap openadapt-ml internals. +For standalone benchmark evaluation (no ML training), use openadapt-evals: ```python - # Preferred (standalone, no openadapt-ml dependency) from openadapt_evals import ApiAgent, WAAMockAdapter, evaluate_agent_on_benchmark - - # Still supported (uses openadapt-ml internals) - from openadapt_ml.benchmarks import PolicyAgent, APIBenchmarkAgent ``` - The following are ONLY available in openadapt-ml (they wrap openadapt-ml internals): - - PolicyAgent (wraps openadapt_ml.runtime.policy.AgentPolicy) - - APIBenchmarkAgent (wraps openadapt_ml.models.api_adapter.ApiVLMAdapter) - - The following should be imported from openadapt-evals: - - ApiAgent (standalone, P0 demo persistence fix) - - All adapter classes (WAAAdapter, WAALiveAdapter, etc.) - - Base classes (BenchmarkAdapter, BenchmarkTask, etc.) - - Evaluation utilities (evaluate_agent_on_benchmark, compute_metrics) - -This module provides interfaces and utilities for evaluating GUI agents -on standardized benchmarks like Windows Agent Arena (WAA), OSWorld, -WebArena, and others. - -Core classes: - - BenchmarkAdapter: Abstract interface for benchmark integration - - BenchmarkAgent: Abstract interface for agents to be evaluated - - BenchmarkTask, BenchmarkObservation, BenchmarkAction: Data classes - -Agent implementations: - - PolicyAgent: Wraps openadapt-ml AgentPolicy - - APIBenchmarkAgent: Uses hosted VLM APIs (Claude, GPT-5.1) via openadapt-ml adapters - - ScriptedAgent: Follows predefined action sequence - - RandomAgent: Takes random actions (baseline) - -Evaluation: - - evaluate_agent_on_benchmark: Run agent on benchmark tasks - - compute_metrics: Compute aggregate metrics from results - -Example: +For ML-specific agents that use trained models: ```python - from openadapt_ml.benchmarks import ( - BenchmarkAdapter, - PolicyAgent, - APIBenchmarkAgent, - evaluate_agent_on_benchmark, - compute_metrics, - ) - - # Create adapter for specific benchmark - adapter = WAAAdapter(waa_repo_path="/path/to/WAA") - - # Wrap policy as benchmark agent - agent = PolicyAgent(policy) - - # Or use API-backed agent for baselines - agent = APIBenchmarkAgent(provider="anthropic") # Claude - agent = APIBenchmarkAgent(provider="openai") # GPT-5.1 - - # Run evaluation - results = evaluate_agent_on_benchmark(agent, adapter, max_steps=50) - - # Compute metrics - metrics = compute_metrics(results) - print(f"Success rate: {metrics['success_rate']:.1%}") + from openadapt_ml.benchmarks import PolicyAgent, APIBenchmarkAgent ``` + +ML-specific agents (only available in openadapt-ml): + - PolicyAgent: Wraps openadapt_ml.runtime.policy.AgentPolicy + - APIBenchmarkAgent: Uses openadapt_ml.models.api_adapter.ApiVLMAdapter + - UnifiedBaselineAgent: Uses openadapt_ml.baselines adapters """ import warnings -# Emit deprecation warning on import +# Emit deprecation warning for users still importing base classes from here warnings.warn( - "openadapt_ml.benchmarks is deprecated. " - "Please use openadapt_evals for standalone benchmark evaluation. " - "See CLAUDE.md for migration guide.", + "For standalone benchmark evaluation, prefer importing from openadapt_evals directly. 
" + "openadapt_ml.benchmarks re-exports from openadapt_evals for backward compatibility.", DeprecationWarning, stacklevel=2, ) # ruff: noqa: E402 # Imports after warning call are intentional -from openadapt_ml.benchmarks.agent import ( - APIBenchmarkAgent, - BenchmarkAgent, - PolicyAgent, - RandomAgent, - ScriptedAgent, - SmartMockAgent, - UnifiedBaselineAgent, -) -from openadapt_ml.benchmarks.base import ( + +# Re-export base classes from openadapt-evals (canonical location) +from openadapt_evals import ( + # Base classes BenchmarkAction, BenchmarkAdapter, BenchmarkObservation, @@ -100,31 +43,51 @@ BenchmarkTask, StaticDatasetAdapter, UIElement, -) -from openadapt_ml.benchmarks.runner import ( + # Base agent interface + BenchmarkAgent, + # Test/mock agents (no ML deps) + RandomAgent, + ScriptedAgent, + SmartMockAgent, + # Standalone API agent (P0 demo persistence fix) + ApiAgent, + # Evaluation utilities EvaluationConfig, compute_domain_metrics, compute_metrics, evaluate_agent_on_benchmark, + # WAA adapters + WAAAdapter, + WAAConfig, + WAAMockAdapter, + WAALiveAdapter, + WAALiveConfig, + # Viewer + generate_benchmark_viewer, + # Data collection + ExecutionTraceCollector, + LiveEvaluationTracker, + save_execution_trace, ) -from openadapt_ml.benchmarks.waa import WAAAdapter, WAAConfig, WAAMockAdapter -from openadapt_ml.benchmarks.waa_live import WAALiveAdapter, WAALiveConfig -from openadapt_ml.benchmarks.viewer import generate_benchmark_viewer +# ML-specific agents (only available in openadapt-ml) +from openadapt_ml.benchmarks.agent import ( + APIBenchmarkAgent, + PolicyAgent, + UnifiedBaselineAgent, +) -# Azure orchestration (lazy import to avoid requiring azure-ai-ml) -def _get_azure_classes(): - from openadapt_ml.benchmarks.azure import ( - AzureConfig, - AzureWAAOrchestrator, - estimate_cost, - ) - return AzureConfig, AzureWAAOrchestrator, estimate_cost +# Lazy import for Azure classes (avoids requiring azure-ai-ml for basic usage) +def __getattr__(name: str): + if name in ("AzureConfig", "AzureWAAOrchestrator", "estimate_cost"): + from openadapt_evals.benchmarks import azure + return getattr(azure, name) + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") __all__ = [ - # Base classes + # Base classes (from openadapt-evals) "BenchmarkAdapter", "BenchmarkTask", "BenchmarkObservation", @@ -132,46 +95,35 @@ def _get_azure_classes(): "BenchmarkResult", "StaticDatasetAdapter", "UIElement", - # Agents + # Agents (from openadapt-evals) "BenchmarkAgent", - "PolicyAgent", - "APIBenchmarkAgent", - "UnifiedBaselineAgent", "ScriptedAgent", "RandomAgent", "SmartMockAgent", - # Evaluation + "ApiAgent", + # ML-specific agents (openadapt-ml only) + "PolicyAgent", + "APIBenchmarkAgent", + "UnifiedBaselineAgent", + # Evaluation (from openadapt-evals) "EvaluationConfig", "evaluate_agent_on_benchmark", "compute_metrics", "compute_domain_metrics", - # WAA + # WAA (from openadapt-evals) "WAAAdapter", "WAAConfig", "WAAMockAdapter", "WAALiveAdapter", "WAALiveConfig", - # Viewer + # Viewer (from openadapt-evals) "generate_benchmark_viewer", - # Azure (lazy-loaded) + # Data collection (from openadapt-evals) + "ExecutionTraceCollector", + "LiveEvaluationTracker", + "save_execution_trace", + # Azure (lazy-loaded from openadapt-evals) "AzureConfig", "AzureWAAOrchestrator", "estimate_cost", ] - - -# Lazy loading for Azure classes (avoids requiring azure-ai-ml for basic usage) -def __getattr__(name: str): - if name in ("AzureConfig", "AzureWAAOrchestrator", "estimate_cost"): - from 
openadapt_ml.benchmarks.azure import ( - AzureConfig, - AzureWAAOrchestrator, - estimate_cost, - ) - - return { - "AzureConfig": AzureConfig, - "AzureWAAOrchestrator": AzureWAAOrchestrator, - "estimate_cost": estimate_cost, - }[name] - raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/openadapt_ml/benchmarks/agent.py b/openadapt_ml/benchmarks/agent.py index aea4d9b..9d4b027 100644 --- a/openadapt_ml/benchmarks/agent.py +++ b/openadapt_ml/benchmarks/agent.py @@ -1,8 +1,15 @@ -"""Agent interface for benchmark evaluation. +"""ML-specific agents for benchmark evaluation. -This module provides the BenchmarkAgent interface that agents must implement -to be evaluated on benchmarks, plus adapters to wrap existing openadapt-ml -components. +This module provides agents that wrap openadapt-ml components (VLM adapters, +policies, baselines) for benchmark evaluation. + +For standalone agents without ML dependencies, use openadapt_evals: + from openadapt_evals import ApiAgent, ScriptedAgent, RandomAgent + +ML-specific agents in this module: + - PolicyAgent: Wraps openadapt_ml.runtime.policy.AgentPolicy + - APIBenchmarkAgent: Uses openadapt_ml.models.api_adapter.ApiVLMAdapter + - UnifiedBaselineAgent: Uses openadapt_ml.baselines adapters Example: from openadapt_ml.benchmarks import PolicyAgent @@ -12,7 +19,7 @@ agent = PolicyAgent(policy) results = evaluate_agent_on_benchmark(agent, benchmark_adapter) - # API-backed agents (GPT-5.1, Claude) + # API-backed agents (GPT-5.1, Claude) using openadapt-ml adapters from openadapt_ml.benchmarks import APIBenchmarkAgent agent = APIBenchmarkAgent(provider="anthropic") # Uses Claude @@ -23,11 +30,12 @@ from __future__ import annotations import re -from abc import ABC, abstractmethod from typing import TYPE_CHECKING, Any -from openadapt_ml.benchmarks.base import ( +# Import base classes from openadapt-evals (canonical location) +from openadapt_evals import ( BenchmarkAction, + BenchmarkAgent, BenchmarkObservation, BenchmarkTask, ) @@ -38,42 +46,6 @@ from openadapt_ml.schema import Action -class BenchmarkAgent(ABC): - """Abstract interface for agents evaluated on benchmarks. - - Agents must implement the `act` method to receive observations - and return actions. The agent can maintain internal state across - steps within an episode. - """ - - @abstractmethod - def act( - self, - observation: BenchmarkObservation, - task: BenchmarkTask, - history: list[tuple[BenchmarkObservation, BenchmarkAction]] | None = None, - ) -> BenchmarkAction: - """Given observation and task, return next action. - - Args: - observation: Current observation from the environment. - task: Task being performed. - history: Optional list of previous (observation, action) pairs. - - Returns: - Action to execute. - """ - pass - - def reset(self) -> None: - """Reset agent state between episodes. - - Called before starting a new task. Override to clear any - internal state. - """ - pass - - class PolicyAgent(BenchmarkAgent): """Wraps openadapt-ml AgentPolicy for benchmark evaluation. @@ -127,61 +99,37 @@ def _build_sample( task: BenchmarkTask, history: list[tuple[BenchmarkObservation, BenchmarkAction]] | None, ) -> dict: - """Build SFT-style sample from benchmark observation. - - Args: - observation: Current observation. - task: Current task. - history: Action history. - - Returns: - Sample dict with 'images' and 'messages'. 
- """ - # Build user message content + """Build SFT-style sample from benchmark observation.""" content_parts = [f"Goal: {task.instruction}"] - # Add accessibility tree if available and enabled if self.use_accessibility_tree and observation.accessibility_tree: tree_str = self._format_accessibility_tree(observation.accessibility_tree) content_parts.append(f"UI Elements:\n{tree_str}") - # Add context if observation.url: content_parts.append(f"URL: {observation.url}") if observation.window_title: content_parts.append(f"Window: {observation.window_title}") - # Add history if enabled if self.use_history and history: history_str = self._format_history(history) content_parts.append(f"Previous actions:\n{history_str}") content_parts.append("What action should be taken next?") - # Build sample sample = { "messages": [ {"role": "user", "content": "\n\n".join(content_parts)}, ], } - # Add image if available if observation.screenshot_path: sample["images"] = [observation.screenshot_path] return sample def _format_accessibility_tree(self, tree: dict, indent: int = 0) -> str: - """Format accessibility tree for prompt. - - Args: - tree: Accessibility tree dict. - indent: Current indentation level. - - Returns: - Formatted string representation. - """ - # Simple formatting - can be overridden for platform-specific formatting + """Format accessibility tree for prompt.""" lines = [] prefix = " " * indent @@ -202,29 +150,15 @@ def _format_accessibility_tree(self, tree: dict, indent: int = 0) -> str: def _format_history( self, history: list[tuple[BenchmarkObservation, BenchmarkAction]] ) -> str: - """Format action history for prompt. - - Args: - history: List of (observation, action) pairs. - - Returns: - Formatted string. - """ + """Format action history for prompt.""" lines = [] - for i, (obs, action) in enumerate(history[-5:], 1): # Last 5 actions + for i, (obs, action) in enumerate(history[-5:], 1): action_str = self._action_to_string(action) lines.append(f"{i}. {action_str}") return "\n".join(lines) def _action_to_string(self, action: BenchmarkAction) -> str: - """Convert BenchmarkAction to string representation. - - Args: - action: Action to convert. - - Returns: - String representation. - """ + """Convert BenchmarkAction to string representation.""" if action.type == "click": if action.target_name: return f"CLICK({action.target_name})" @@ -249,31 +183,19 @@ def _action_to_string(self, action: BenchmarkAction) -> str: def _to_benchmark_action( self, action: Action, thought: str | None ) -> BenchmarkAction: - """Convert openadapt-ml Action to BenchmarkAction. - - Args: - action: Action from policy. - thought: Optional thought/reasoning. - - Returns: - BenchmarkAction. - """ - # Extract normalized coordinates + """Convert openadapt-ml Action to BenchmarkAction.""" x, y = None, None if action.normalized_coordinates is not None: x, y = action.normalized_coordinates - # Extract end coordinates for drag end_x, end_y = None, None if action.normalized_end is not None: end_x, end_y = action.normalized_end - # Extract action type value (enum -> string) action_type = ( action.type.value if hasattr(action.type, "value") else action.type ) - # Extract element info if available target_node_id = None target_role = None target_name = None @@ -311,192 +233,28 @@ def _to_benchmark_action( def reset(self) -> None: """Reset agent state.""" - # PolicyAgent is stateless, nothing to reset pass -class ScriptedAgent(BenchmarkAgent): - """Agent that follows a predefined script of actions. 
- - Useful for testing benchmark adapters or replaying trajectories. - - Args: - actions: List of actions to execute in order. - """ - - def __init__(self, actions: list[BenchmarkAction]): - self.actions = actions - self._step = 0 - - def act( - self, - observation: BenchmarkObservation, - task: BenchmarkTask, - history: list[tuple[BenchmarkObservation, BenchmarkAction]] | None = None, - ) -> BenchmarkAction: - """Return the next scripted action. - - Args: - observation: Ignored. - task: Ignored. - history: Ignored. - - Returns: - Next action from script, or DONE if script exhausted. - """ - if self._step < len(self.actions): - action = self.actions[self._step] - self._step += 1 - return action - return BenchmarkAction(type="done") - - def reset(self) -> None: - """Reset step counter.""" - self._step = 0 - - -class RandomAgent(BenchmarkAgent): - """Agent that takes random actions. - - Useful for baseline comparisons. - - Args: - action_types: List of action types to randomly select from. - seed: Random seed for reproducibility. - """ - - def __init__( - self, - action_types: list[str] | None = None, - seed: int | None = None, - ): - import random - - self.action_types = action_types or ["click", "type", "scroll", "done"] - self.rng = random.Random(seed) - - def act( - self, - observation: BenchmarkObservation, - task: BenchmarkTask, - history: list[tuple[BenchmarkObservation, BenchmarkAction]] | None = None, - ) -> BenchmarkAction: - """Return a random action. - - Args: - observation: Used to get viewport bounds. - task: Ignored. - history: Used to decide when to stop. - - Returns: - Random action. - """ - # Stop after many actions - if history and len(history) > 20: - return BenchmarkAction(type="done") - - action_type = self.rng.choice(self.action_types) - - if action_type == "click": - return BenchmarkAction( - type="click", - x=self.rng.random(), - y=self.rng.random(), - ) - elif action_type == "type": - return BenchmarkAction( - type="type", - text="test", - ) - elif action_type == "scroll": - return BenchmarkAction( - type="scroll", - scroll_direction=self.rng.choice(["up", "down"]), - ) - else: - return BenchmarkAction(type="done") - - def reset(self) -> None: - """Nothing to reset.""" - pass - - -class SmartMockAgent(BenchmarkAgent): - """Agent designed to pass WAAMockAdapter evaluation. - - Performs a fixed sequence of actions that satisfy the mock adapter's - success criteria. Use for validating the benchmark pipeline locally. - - The mock adapter evaluates success based on: - - Clicking Submit (ID 4) - primary success path - - Typing something AND clicking OK (ID 1) - form submission path - - Calling DONE after at least 2 actions - reasonable completion - - This agent clicks Submit (ID 4) which is the simplest success path. - """ - - def __init__(self): - """Initialize the agent.""" - self._step = 0 - # Simple action sequence: click Submit button (ID 4), then done - self._actions = [ - BenchmarkAction(type="click", target_node_id="4"), # Click Submit - BenchmarkAction(type="done"), - ] - - def act( - self, - observation: BenchmarkObservation, - task: BenchmarkTask, - history: list[tuple[BenchmarkObservation, BenchmarkAction]] | None = None, - ) -> BenchmarkAction: - """Return the next scripted action. - - Args: - observation: Ignored. - task: Ignored. - history: Ignored. - - Returns: - Next action from script, or DONE if script exhausted. 
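The scripted and random baselines deleted here are consumed from openadapt-evals instead; a minimal sketch, assuming the constructors keep the signatures shown in the removed code:

```python
from openadapt_evals import BenchmarkAction, RandomAgent, ScriptedAgent

# Replay a fixed two-step trajectory; the agent returns DONE once the script is exhausted.
scripted = ScriptedAgent(
    actions=[
        BenchmarkAction(type="click", x=0.5, y=0.3),
        BenchmarkAction(type="type", text="hello"),
    ]
)

# Seeded random baseline for cheap comparison runs.
random_agent = RandomAgent(seed=0)
```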
- """ - if self._step < len(self._actions): - action = self._actions[self._step] - self._step += 1 - return action - return BenchmarkAction(type="done") - - def reset(self) -> None: - """Reset step counter.""" - self._step = 0 - - class APIBenchmarkAgent(BenchmarkAgent): - """Agent that uses hosted VLM APIs (Claude, GPT-5.1) for benchmark evaluation. + """Agent that uses hosted VLM APIs via openadapt-ml ApiVLMAdapter. This agent wraps ApiVLMAdapter to provide Claude or GPT-5.1 baselines for benchmark evaluation. It converts BenchmarkObservation to the API format and parses VLM responses into BenchmarkActions. + Note: For standalone API evaluation without openadapt-ml, use + openadapt_evals.ApiAgent instead (has P0 demo persistence fix). + Args: provider: API provider - "anthropic" (Claude) or "openai" (GPT-5.1). api_key: Optional API key override. If not provided, uses env vars. - model: Optional model name override. Defaults to provider's best VLM. + model: Optional model name override. max_tokens: Maximum tokens for VLM response. use_accessibility_tree: Whether to include accessibility tree in prompt. use_history: Whether to include action history in prompt. - - Example: - # Claude baseline - agent = APIBenchmarkAgent(provider="anthropic") - results = evaluate_agent_on_benchmark(agent, waa_adapter) - - # GPT-5.1 baseline - agent = APIBenchmarkAgent(provider="openai") - results = evaluate_agent_on_benchmark(agent, waa_adapter) """ - # System prompt for GUI automation SYSTEM_PROMPT = """You are a GUI automation agent. Given a screenshot and task instruction, determine the next action to take. Available actions: @@ -506,7 +264,7 @@ class APIBenchmarkAgent(BenchmarkAgent): - KEY(key) - Press a key (e.g., Enter, Tab, Escape) - KEY(modifier+key) - Press key combination (e.g., Ctrl+c, Alt+Tab) - SCROLL(direction) - Scroll up or down -- DRAG(x1, y1, x2, y2) - Drag from (x1,y1) to (x2,y2) (pixel or normalized) +- DRAG(x1, y1, x2, y2) - Drag from (x1,y1) to (x2,y2) - DONE() - Task is complete - ANSWER("response") - For QA tasks, provide the answer @@ -555,32 +313,15 @@ def act( task: BenchmarkTask, history: list[tuple[BenchmarkObservation, BenchmarkAction]] | None = None, ) -> BenchmarkAction: - """Use VLM API to determine next action. - - Args: - observation: Current observation with screenshot. - task: Task being performed. - history: Previous observations and actions. - - Returns: - BenchmarkAction parsed from VLM response. - """ + """Use VLM API to determine next action.""" adapter = self._get_adapter() - - # Build the sample for the API sample = self._build_sample(observation, task, history) - # Call the VLM API try: response = adapter.generate(sample, max_new_tokens=self.max_tokens) except Exception as e: - # On API error, return done to avoid infinite loops - return BenchmarkAction( - type="done", - raw_action={"error": str(e)}, - ) + return BenchmarkAction(type="done", raw_action={"error": str(e)}) - # Parse the response into a BenchmarkAction return self._parse_response(response, observation) def _build_sample( @@ -589,41 +330,26 @@ def _build_sample( task: BenchmarkTask, history: list[tuple[BenchmarkObservation, BenchmarkAction]] | None, ) -> dict[str, Any]: - """Build API sample from benchmark observation. - - Args: - observation: Current observation. - task: Current task. - history: Action history. - - Returns: - Sample dict with 'images' and 'messages'. 
- """ - # Build user message content + """Build API sample from benchmark observation.""" content_parts = [f"GOAL: {task.instruction}"] - # Add context if observation.url: content_parts.append(f"URL: {observation.url}") if observation.window_title: content_parts.append(f"Window: {observation.window_title}") - # Add accessibility tree if available and enabled if self.use_accessibility_tree and observation.accessibility_tree: tree_str = self._format_accessibility_tree(observation.accessibility_tree) - # Truncate if too long if len(tree_str) > 4000: tree_str = tree_str[:4000] + "\n... (truncated)" content_parts.append(f"UI Elements:\n{tree_str}") - # Add history if enabled if self.use_history and history: history_str = self._format_history(history) content_parts.append(f"Previous actions:\n{history_str}") content_parts.append("\nWhat is the next action?") - # Build sample sample: dict[str, Any] = { "messages": [ {"role": "system", "content": self.SYSTEM_PROMPT}, @@ -631,22 +357,13 @@ def _build_sample( ], } - # Add image if available if observation.screenshot_path: sample["images"] = [observation.screenshot_path] return sample def _format_accessibility_tree(self, tree: dict, indent: int = 0) -> str: - """Format accessibility tree for prompt. - - Args: - tree: Accessibility tree dict. - indent: Current indentation level. - - Returns: - Formatted string representation. - """ + """Format accessibility tree for prompt.""" lines = [] prefix = " " * indent @@ -667,29 +384,15 @@ def _format_accessibility_tree(self, tree: dict, indent: int = 0) -> str: def _format_history( self, history: list[tuple[BenchmarkObservation, BenchmarkAction]] ) -> str: - """Format action history for prompt. - - Args: - history: List of (observation, action) pairs. - - Returns: - Formatted string. - """ + """Format action history for prompt.""" lines = [] - for i, (obs, action) in enumerate(history[-5:], 1): # Last 5 actions + for i, (obs, action) in enumerate(history[-5:], 1): action_str = self._action_to_string(action) lines.append(f"{i}. {action_str}") return "\n".join(lines) def _action_to_string(self, action: BenchmarkAction) -> str: - """Convert BenchmarkAction to string representation. - - Args: - action: Action to convert. - - Returns: - String representation. - """ + """Convert BenchmarkAction to string representation.""" if action.type == "click": if action.target_node_id: return f"CLICK([{action.target_node_id}])" @@ -718,32 +421,14 @@ def _action_to_string(self, action: BenchmarkAction) -> str: def _parse_response( self, response: str, observation: BenchmarkObservation | None = None ) -> BenchmarkAction: - """Parse VLM response into BenchmarkAction. - - Handles various response formats: - - ACTION: CLICK(0.5, 0.3) - - CLICK(0.5, 0.3) - - I'll click at coordinates (0.5, 0.3) -> CLICK(0.5, 0.3) - - Args: - response: Raw VLM response text. - observation: Current observation (used for coordinate normalization). - - Returns: - Parsed BenchmarkAction. 
- """ - # Store raw response for debugging + """Parse VLM response into BenchmarkAction.""" raw_action = {"response": response} - # Extract action line (look for ACTION: prefix or action pattern) action_line = None - - # Try to find ACTION: prefix action_match = re.search(r"ACTION:\s*(.+)", response, re.IGNORECASE) if action_match: action_line = action_match.group(1).strip() else: - # Look for action pattern anywhere in response patterns = [ r"(CLICK\s*\([^)]+\))", r"(TYPE\s*\([^)]+\))", @@ -760,202 +445,102 @@ def _parse_response( break if not action_line: - # Could not parse action, return done raw_action["parse_error"] = "No action pattern found" return BenchmarkAction(type="done", raw_action=raw_action) - # Parse CLICK action + # Parse CLICK([id]) click_match = re.match( r"CLICK\s*\(\s*\[?(\d+)\]?\s*\)", action_line, re.IGNORECASE ) if click_match: - # CLICK([id]) - element ID node_id = click_match.group(1) - return BenchmarkAction( - type="click", - target_node_id=node_id, - raw_action=raw_action, - ) + return BenchmarkAction(type="click", target_node_id=node_id, raw_action=raw_action) + # Parse CLICK(x, y) click_coords = re.match( r"CLICK\s*\(\s*([\d.]+)\s*,\s*([\d.]+)\s*\)", action_line, re.IGNORECASE ) if click_coords: - # CLICK(x, y) - coordinates x = float(click_coords.group(1)) y = float(click_coords.group(2)) - - # Normalize coordinates if they appear to be pixel values - # If x or y > 1.0, assume pixel coordinates and normalize using viewport if observation and observation.viewport and (x > 1.0 or y > 1.0): width, height = observation.viewport - x_norm = x / width - y_norm = y / height raw_action["original_coords"] = {"x": x, "y": y} raw_action["normalized"] = True - x = x_norm - y = y_norm - - return BenchmarkAction( - type="click", - x=x, - y=y, - raw_action=raw_action, - ) + x, y = x / width, y / height + return BenchmarkAction(type="click", x=x, y=y, raw_action=raw_action) - # Parse TYPE action + # Parse TYPE type_match = re.match( r"TYPE\s*\(\s*[\"'](.+?)[\"']\s*\)", action_line, re.IGNORECASE ) if type_match: - text = type_match.group(1) - return BenchmarkAction( - type="type", - text=text, - raw_action=raw_action, - ) + return BenchmarkAction(type="type", text=type_match.group(1), raw_action=raw_action) - # Parse KEY action + # Parse KEY key_match = re.match(r"KEY\s*\(\s*(.+?)\s*\)", action_line, re.IGNORECASE) if key_match: key_str = key_match.group(1) - # Handle modifier+key format if "+" in key_str: parts = key_str.split("+") - key = parts[-1] - modifiers = parts[:-1] - return BenchmarkAction( - type="key", - key=key, - modifiers=modifiers, - raw_action=raw_action, - ) - return BenchmarkAction( - type="key", - key=key_str, - raw_action=raw_action, - ) + return BenchmarkAction(type="key", key=parts[-1], modifiers=parts[:-1], raw_action=raw_action) + return BenchmarkAction(type="key", key=key_str, raw_action=raw_action) - # Parse SCROLL action + # Parse SCROLL scroll_match = re.match( r"SCROLL\s*\(\s*(up|down)\s*\)", action_line, re.IGNORECASE ) if scroll_match: - direction = scroll_match.group(1).lower() - return BenchmarkAction( - type="scroll", - scroll_direction=direction, - raw_action=raw_action, - ) + return BenchmarkAction(type="scroll", scroll_direction=scroll_match.group(1).lower(), raw_action=raw_action) - # Parse DRAG action + # Parse DRAG drag_match = re.match( r"DRAG\s*\(\s*([\d.]+)\s*,\s*([\d.]+)\s*,\s*([\d.]+)\s*,\s*([\d.]+)\s*\)", - action_line, - re.IGNORECASE, + action_line, re.IGNORECASE, ) if drag_match: - x = float(drag_match.group(1)) - y = 
float(drag_match.group(2)) - end_x = float(drag_match.group(3)) - end_y = float(drag_match.group(4)) - - # Normalize coordinates if they appear to be pixel values - if ( - observation - and observation.viewport - and (x > 1.0 or y > 1.0 or end_x > 1.0 or end_y > 1.0) - ): + x, y = float(drag_match.group(1)), float(drag_match.group(2)) + end_x, end_y = float(drag_match.group(3)), float(drag_match.group(4)) + if observation and observation.viewport and (x > 1.0 or y > 1.0 or end_x > 1.0 or end_y > 1.0): width, height = observation.viewport - raw_action["original_coords"] = { - "x": x, - "y": y, - "end_x": end_x, - "end_y": end_y, - } + raw_action["original_coords"] = {"x": x, "y": y, "end_x": end_x, "end_y": end_y} raw_action["normalized"] = True - x = x / width - y = y / height - end_x = end_x / width - end_y = end_y / height + x, y, end_x, end_y = x/width, y/height, end_x/width, end_y/height + return BenchmarkAction(type="drag", x=x, y=y, end_x=end_x, end_y=end_y, raw_action=raw_action) - return BenchmarkAction( - type="drag", - x=x, - y=y, - end_x=end_x, - end_y=end_y, - raw_action=raw_action, - ) - - # Parse DONE action + # Parse DONE if re.match(r"DONE\s*\(\s*\)", action_line, re.IGNORECASE): return BenchmarkAction(type="done", raw_action=raw_action) - # Parse ANSWER action + # Parse ANSWER answer_match = re.match( r"ANSWER\s*\(\s*[\"'](.+?)[\"']\s*\)", action_line, re.IGNORECASE ) if answer_match: - answer = answer_match.group(1) - return BenchmarkAction( - type="answer", - answer=answer, - raw_action=raw_action, - ) + return BenchmarkAction(type="answer", answer=answer_match.group(1), raw_action=raw_action) - # Unknown action format raw_action["parse_error"] = f"Unknown action format: {action_line}" return BenchmarkAction(type="done", raw_action=raw_action) def reset(self) -> None: """Reset agent state.""" - # APIBenchmarkAgent is stateless, nothing to reset pass class UnifiedBaselineAgent(BenchmarkAgent): - """Agent that uses the UnifiedBaselineAdapter for benchmark evaluation. - - This agent provides a unified interface for comparing Claude, GPT, and Gemini - models across multiple evaluation tracks (coordinates, ReAct, SoM). + """Agent that uses UnifiedBaselineAdapter for benchmark evaluation. - Compared to APIBenchmarkAgent, this agent: - - Uses the new provider abstraction (models/providers/) - - Supports multiple tracks (A, B, C) with track-specific prompts - - Uses the unified response parser - - Supports model aliases for easy switching + Provides unified interface for Claude, GPT, and Gemini baselines + across multiple tracks (A: coordinates, B: ReAct, C: SoM). Args: - model_alias: Model alias (e.g., 'claude-opus-4.5', 'gpt-5.2', 'gemini-3-pro'). + model_alias: Model alias (e.g., 'claude-opus-4.5', 'gpt-5.2'). track: Track type ('A', 'B', or 'C'). Defaults to 'A'. - api_key: Optional API key override. If not provided, uses env vars. - temperature: Sampling temperature. Defaults to 0.1. - max_tokens: Maximum tokens for response. Defaults to 1024. - demo: Optional demo text to include in prompts. - verbose: Whether to print verbose debug output. 
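A small worked example of the pixel-to-normalized rule used by `_parse_response` for CLICK and DRAG above: coordinates greater than 1.0 are treated as pixels, divided by the viewport size, and the originals are kept in `raw_action` for debugging.

```python
viewport = (1280, 720)          # (width, height) from the observation
x, y = 640.0, 360.0             # parsed from "CLICK(640, 360)"

raw_action = {"response": "ACTION: CLICK(640, 360)"}
if x > 1.0 or y > 1.0:
    width, height = viewport
    raw_action["original_coords"] = {"x": x, "y": y}
    raw_action["normalized"] = True
    x, y = x / width, y / height

assert (x, y) == (0.5, 0.5)
```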
- - Example: - # Claude baseline with Track C (Set-of-Mark) - agent = UnifiedBaselineAgent( - model_alias="claude-opus-4.5", - track="C", - ) - results = evaluate_agent_on_benchmark(agent, waa_adapter) - - # GPT baseline with Track A (direct coordinates) - agent = UnifiedBaselineAgent( - model_alias="gpt-5.2", - track="A", - ) - results = evaluate_agent_on_benchmark(agent, waa_adapter) - - # Gemini baseline with Track B (ReAct reasoning) - agent = UnifiedBaselineAgent( - model_alias="gemini-3-pro", - track="B", - ) - results = evaluate_agent_on_benchmark(agent, waa_adapter) + api_key: Optional API key override. + temperature: Sampling temperature. + max_tokens: Maximum tokens for response. + demo: Optional demo text for prompts. + verbose: Whether to print debug output. """ def __init__( @@ -980,12 +565,8 @@ def __init__( def _get_adapter(self): """Lazily initialize the UnifiedBaselineAdapter.""" if self._adapter is None: - from openadapt_ml.baselines import ( - TrackConfig, - UnifiedBaselineAdapter, - ) + from openadapt_ml.baselines import TrackConfig, UnifiedBaselineAdapter - # Select track config track_configs = { "A": TrackConfig.track_a(), "B": TrackConfig.track_b(), @@ -993,7 +574,6 @@ def _get_adapter(self): } track_config = track_configs.get(self.track, TrackConfig.track_a()) - # Create adapter from alias self._adapter = UnifiedBaselineAdapter.from_alias( self.model_alias, track=track_config, @@ -1011,21 +591,11 @@ def act( task: BenchmarkTask, history: list[tuple[BenchmarkObservation, BenchmarkAction]] | None = None, ) -> BenchmarkAction: - """Use UnifiedBaselineAdapter to determine next action. - - Args: - observation: Current observation with screenshot. - task: Task being performed. - history: Previous observations and actions. - - Returns: - BenchmarkAction parsed from adapter response. 
- """ + """Use UnifiedBaselineAdapter to determine next action.""" from PIL import Image adapter = self._get_adapter() - # Load screenshot if available screenshot = None if observation.screenshot_path: try: @@ -1034,19 +604,12 @@ def act( if self.verbose: print(f"[UnifiedBaselineAgent] Failed to load screenshot: {e}") - # Build accessibility tree string - a11y_tree = None - if observation.accessibility_tree: - a11y_tree = observation.accessibility_tree + a11y_tree = observation.accessibility_tree if observation.accessibility_tree else None - # Build history for adapter adapter_history = None if history: - adapter_history = [] - for obs, action in history[-5:]: # Last 5 actions - adapter_history.append(self._benchmark_action_to_dict(action)) + adapter_history = [self._benchmark_action_to_dict(a) for _, a in history[-5:]] - # Call adapter try: parsed_action = adapter.predict( screenshot=screenshot, @@ -1057,18 +620,13 @@ def act( except Exception as e: if self.verbose: print(f"[UnifiedBaselineAgent] Adapter error: {e}") - return BenchmarkAction( - type="done", - raw_action={"error": str(e)}, - ) + return BenchmarkAction(type="done", raw_action={"error": str(e)}) - # Convert ParsedAction to BenchmarkAction return self._parsed_to_benchmark_action(parsed_action, observation) def _benchmark_action_to_dict(self, action: BenchmarkAction) -> dict[str, Any]: """Convert BenchmarkAction to dict for history.""" result = {"type": action.type} - if action.x is not None: result["x"] = action.x if action.y is not None: @@ -1081,23 +639,12 @@ def _benchmark_action_to_dict(self, action: BenchmarkAction) -> dict[str, Any]: result["element_id"] = action.target_node_id if action.scroll_direction: result["direction"] = action.scroll_direction - return result def _parsed_to_benchmark_action( - self, - parsed_action, - observation: BenchmarkObservation | None = None, + self, parsed_action, observation: BenchmarkObservation | None = None ) -> BenchmarkAction: - """Convert ParsedAction to BenchmarkAction. - - Args: - parsed_action: ParsedAction from adapter. - observation: Current observation (for coordinate normalization). - - Returns: - BenchmarkAction. 
- """ + """Convert ParsedAction to BenchmarkAction.""" raw_action = { "raw_response": parsed_action.raw_response, "thought": parsed_action.thought, @@ -1112,75 +659,42 @@ def _parsed_to_benchmark_action( if action_type == "click": if parsed_action.element_id is not None: return BenchmarkAction( - type="click", - target_node_id=str(parsed_action.element_id), - raw_action=raw_action, + type="click", target_node_id=str(parsed_action.element_id), raw_action=raw_action ) elif parsed_action.x is not None and parsed_action.y is not None: - x = parsed_action.x - y = parsed_action.y - - # Normalize coordinates if they appear to be pixel values + x, y = parsed_action.x, parsed_action.y if observation and observation.viewport and (x > 1.0 or y > 1.0): width, height = observation.viewport raw_action["original_coords"] = {"x": x, "y": y} - raw_action["normalized"] = True - x = x / width - y = y / height - - return BenchmarkAction( - type="click", - x=x, - y=y, - raw_action=raw_action, - ) + x, y = x / width, y / height + return BenchmarkAction(type="click", x=x, y=y, raw_action=raw_action) elif action_type == "type": - return BenchmarkAction( - type="type", - text=parsed_action.text, - raw_action=raw_action, - ) + return BenchmarkAction(type="type", text=parsed_action.text, raw_action=raw_action) elif action_type == "key": - return BenchmarkAction( - type="key", - key=parsed_action.key, - raw_action=raw_action, - ) + return BenchmarkAction(type="key", key=parsed_action.key, raw_action=raw_action) elif action_type == "scroll": - return BenchmarkAction( - type="scroll", - scroll_direction=parsed_action.direction, - raw_action=raw_action, - ) + return BenchmarkAction(type="scroll", scroll_direction=parsed_action.direction, raw_action=raw_action) elif action_type == "done": return BenchmarkAction(type="done", raw_action=raw_action) elif action_type == "drag": - x = parsed_action.x - y = parsed_action.y - end_x = getattr(parsed_action, "end_x", None) - end_y = getattr(parsed_action, "end_y", None) - return BenchmarkAction( type="drag", - x=x, - y=y, - end_x=end_x, - end_y=end_y, + x=parsed_action.x, y=parsed_action.y, + end_x=getattr(parsed_action, "end_x", None), + end_y=getattr(parsed_action, "end_y", None), raw_action=raw_action, ) - # Unknown action type, return done raw_action["unknown_action"] = action_type return BenchmarkAction(type="done", raw_action=raw_action) def reset(self) -> None: """Reset agent state.""" - # UnifiedBaselineAgent is stateless, nothing to reset pass def __repr__(self) -> str: diff --git a/openadapt_ml/benchmarks/base.py b/openadapt_ml/benchmarks/base.py index 522914e..569b6ed 100644 --- a/openadapt_ml/benchmarks/base.py +++ b/openadapt_ml/benchmarks/base.py @@ -1,368 +1,36 @@ -"""Base classes for benchmark integration. +"""DEPRECATED: Import from openadapt_evals instead. -This module provides the core abstractions for integrating GUI agent benchmarks -into openadapt-ml. It supports both interactive environments (WAA, OSWorld) and -static trajectory datasets (Mind2Web). - -Example: - from openadapt_ml.benchmarks import WAAAdapter, evaluate_agent_on_benchmark - - adapter = WAAAdapter(waa_repo_path="/path/to/WAA") - results = evaluate_agent_on_benchmark(agent, adapter, max_steps=50) +This module is kept for backward compatibility only. +All classes are now provided by openadapt_evals.adapters.base. 
""" -from __future__ import annotations - -from abc import ABC, abstractmethod -from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Any, Iterator - -if TYPE_CHECKING: - pass - - -@dataclass -class BenchmarkTask: - """Canonical task representation. - - Attributes: - task_id: Unique identifier for the task. - instruction: Natural language task instruction. - domain: Task domain ("web", "desktop", "mobile"). - initial_state_ref: Reference to initial state (VM snapshot, URL, etc.). - time_limit_steps: Maximum steps allowed for the task. - raw_config: Original benchmark config (lossless preservation). - evaluation_spec: Benchmark-native evaluation specification. - """ - - task_id: str - instruction: str - domain: str # "web", "desktop", "mobile" - - # Environment setup - initial_state_ref: str | None = None # VM snapshot, storage_state, start URL - time_limit_steps: int | None = None - - # Preserve original config losslessly - raw_config: dict[str, Any] = field(default_factory=dict) - - # Evaluation spec (benchmark-native) - evaluation_spec: dict[str, Any] | None = None - - -@dataclass -class BenchmarkObservation: - """Canonical observation at each step. - - Supports multiple observation modalities: - - Visual: screenshots with viewport info - - Structured UI: accessibility tree (UIA/AXTree/DOM) - - Context: URL, window title, focused element - - Attributes: - screenshot: PNG image bytes. - screenshot_path: Path to saved screenshot. - viewport: (width, height) of the viewport. - accessibility_tree: Platform-specific UI tree (UIA/AXTree/DOM). - dom_html: Raw HTML for web tasks. - url: Current URL for web tasks. - window_title: Active window title for desktop tasks. - focused_element: Currently focused UI element. - raw_observation: Original benchmark observation (lossless). - """ - - # Visual - screenshot: bytes | None = None # PNG image bytes - screenshot_path: str | None = None - viewport: tuple[int, int] | None = None # (width, height) - - # Structured UI (format varies by platform) - accessibility_tree: dict | None = None # UIA (Windows), AXTree (macOS), DOM (web) - dom_html: str | None = None # Raw HTML for web - - # Context - url: str | None = None # For web tasks - window_title: str | None = None # For desktop tasks - app_name: str | None = None # Active application - focused_element: dict | None = None # {node_id, bbox, text} - - # Raw benchmark-specific data (lossless) - raw_observation: dict[str, Any] | None = None - - -@dataclass -class BenchmarkAction: - """Canonical action representation. - - Supports multiple action types with both coordinate-based and element-based - grounding. The "grounding-first" approach stores both when available. - - Attributes: - type: Action type ("click", "type", "scroll", "key", "drag", "answer", "done"). - x: X coordinate (normalized [0,1] or pixels). - y: Y coordinate (normalized [0,1] or pixels). - target_node_id: Element ID from accessibility tree. - target_bbox: Element bounding box. - target_role: Element role (button, textfield, etc.). - target_name: Element accessible name. - text: Text to type (for "type" action). - key: Single key (for "key" action, e.g., "Enter", "Tab"). - modifiers: Key modifiers (["ctrl", "shift", "alt"]). - scroll_direction: Scroll direction ("up", "down", "left", "right"). - scroll_amount: Scroll amount (pixels or normalized). - end_x: Drag end X coordinate. - end_y: Drag end Y coordinate. - answer: Answer string (for benchmarks that score by answer). 
- raw_action: Original benchmark action (lossless). - """ - - type: str # "click", "type", "scroll", "key", "drag", "answer", "done" - - # Pointer actions - coordinates - x: float | None = None # Normalized [0,1] or pixel - y: float | None = None - - # Element grounding (when available) - target_node_id: str | None = None # DOM/AX/UIA node ID - target_bbox: tuple[float, float, float, float] | None = None - target_role: str | None = None # "button", "textfield", etc. - target_name: str | None = None # Accessible name - - # Keyboard actions - text: str | None = None # For "type" action - text to type - key: str | None = None # For "key" action - single key - modifiers: list[str] | None = None # ["ctrl", "shift", "alt"] - - # Scroll actions - scroll_direction: str | None = None # "up", "down", "left", "right" - scroll_amount: float | None = None # Pixels or normalized - - # Drag actions - end_x: float | None = None - end_y: float | None = None - - # Answer action (some benchmarks score by final answer) - answer: str | None = None - - # Raw benchmark-specific format (lossless) - raw_action: dict[str, Any] | None = None - - -@dataclass -class BenchmarkResult: - """Result of a single task evaluation. - - Attributes: - task_id: ID of the evaluated task. - success: Whether the task was completed successfully. - score: Score between 0.0 and 1.0. - steps: List of (observation, action) pairs from the trajectory. - num_steps: Number of steps taken. - error: Error message if task failed due to error. - reason: Explanation of success/failure. - total_time_seconds: Total time taken for the task. - """ - - task_id: str - success: bool - score: float # 0.0 to 1.0 - - # Trajectory - steps: list[tuple[BenchmarkObservation, BenchmarkAction]] = field( - default_factory=list - ) - num_steps: int = 0 - - # Diagnostics - error: str | None = None - reason: str | None = None # Why success/fail - - # Timing - total_time_seconds: float = 0.0 - - -@dataclass -class UIElement: - """Normalized UI element for cross-platform use. - - Provides a common representation for UI elements across platforms - (Windows UIA, macOS AXTree, web DOM). - - Attributes: - node_id: Unique identifier for the element. - role: Element role (button, textfield, link, etc.). - name: Accessible name/label. - bbox: Bounding box (normalized [0,1] or pixels). - text: Text content. - value: Current value (for inputs). - children: Child elements. - attributes: Additional platform-specific attributes. - """ - - node_id: str - role: str # "button", "textfield", "link", etc. - name: str | None = None # Accessible name/label - bbox: tuple[float, float, float, float] | None = None # (x1, y1, x2, y2) - text: str | None = None # Text content - value: str | None = None # Current value (for inputs) - children: list[UIElement] | None = None - attributes: dict[str, Any] | None = None # Platform-specific - - -class BenchmarkAdapter(ABC): - """Abstract interface for benchmark integration. - - Subclasses implement this interface to integrate specific benchmarks - (WAA, OSWorld, WebArena, etc.) with openadapt-ml. 
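A minimal sketch of the "grounding-first" convention documented for `BenchmarkAction` above: keep both the element grounding and the normalized coordinates when both are known, so adapters can execute with whichever they support.

```python
from openadapt_evals import BenchmarkAction

click = BenchmarkAction(
    type="click",
    x=0.62, y=0.41,              # normalized fallback coordinates
    target_node_id="42",         # accessibility-tree node, preferred when present
    target_role="button",
    target_name="Submit",
)
```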
- - Two types of adapters: - - Interactive: Run environment, step through tasks (WAA, OSWorld) - - Static: Load trajectories for offline training/eval (Mind2Web) - """ - - @property - @abstractmethod - def name(self) -> str: - """Benchmark name (e.g., 'waa', 'osworld', 'webarena').""" - pass - - @property - @abstractmethod - def benchmark_type(self) -> str: - """Benchmark type: 'interactive' or 'static'.""" - pass - - @property - def supports_parallel(self) -> bool: - """Whether the adapter supports parallel task execution.""" - return False - - @abstractmethod - def list_tasks(self, domain: str | None = None) -> list[BenchmarkTask]: - """List available tasks, optionally filtered by domain. - - Args: - domain: Optional domain filter (e.g., "browser", "office"). - - Returns: - List of BenchmarkTask objects. - """ - pass - - @abstractmethod - def load_task(self, task_id: str) -> BenchmarkTask: - """Load a specific task by ID. - - Args: - task_id: Task identifier. - - Returns: - BenchmarkTask object. - - Raises: - KeyError: If task_id not found. - """ - pass - - @abstractmethod - def reset(self, task: BenchmarkTask) -> BenchmarkObservation: - """Reset environment to task's initial state. - - Args: - task: Task to initialize. - - Returns: - Initial observation. - """ - pass - - @abstractmethod - def step( - self, action: BenchmarkAction - ) -> tuple[BenchmarkObservation, bool, dict[str, Any]]: - """Execute action and return new observation. - - Args: - action: Action to execute. - - Returns: - Tuple of (observation, done, info). - """ - pass - - @abstractmethod - def evaluate(self, task: BenchmarkTask) -> BenchmarkResult: - """Run benchmark's native evaluation on current state. - - Args: - task: Task to evaluate. - - Returns: - BenchmarkResult with success/score. - """ - pass - - def close(self) -> None: - """Clean up resources (VMs, browser, etc.).""" - pass - - def __enter__(self) -> BenchmarkAdapter: - """Context manager entry.""" - return self - - def __exit__(self, exc_type, exc_val, exc_tb) -> None: - """Context manager exit.""" - self.close() - - -class StaticDatasetAdapter(BenchmarkAdapter): - """Base for static trajectory datasets (Mind2Web, demos). - - Static adapters load pre-recorded trajectories for offline training - or evaluation, rather than running an interactive environment. - """ - - @property - def benchmark_type(self) -> str: - """Static datasets are not interactive.""" - return "static" - - @abstractmethod - def load_trajectories( - self, split: str = "test" - ) -> Iterator[ - tuple[BenchmarkTask, list[tuple[BenchmarkObservation, BenchmarkAction]]] - ]: - """Iterate over expert trajectories. - - Args: - split: Dataset split ("train", "val", "test"). - - Yields: - Tuples of (task, trajectory) where trajectory is a list of - (observation, action) pairs. - """ - pass - - def reset(self, task: BenchmarkTask) -> BenchmarkObservation: - """Not supported for static datasets.""" - raise NotImplementedError( - "Static datasets don't support interactive reset. " - "Use load_trajectories() instead." - ) - - def step( - self, action: BenchmarkAction - ) -> tuple[BenchmarkObservation, bool, dict[str, Any]]: - """Not supported for static datasets.""" - raise NotImplementedError( - "Static datasets don't support interactive stepping. " - "Use load_trajectories() instead." - ) - - def evaluate(self, task: BenchmarkTask) -> BenchmarkResult: - """Not supported for static datasets.""" - raise NotImplementedError( - "Static datasets don't support execution-based evaluation. 
" - "Use offline metrics instead." - ) +import warnings + +warnings.warn( + "openadapt_ml.benchmarks.base is deprecated. " + "Please import from openadapt_evals instead: " + "from openadapt_evals import BenchmarkAdapter, BenchmarkTask, BenchmarkAction", + DeprecationWarning, + stacklevel=2, +) + +# Re-export from canonical location +from openadapt_evals.adapters.base import ( + BenchmarkAction, + BenchmarkAdapter, + BenchmarkObservation, + BenchmarkResult, + BenchmarkTask, + StaticDatasetAdapter, + UIElement, +) + +__all__ = [ + "BenchmarkAction", + "BenchmarkAdapter", + "BenchmarkObservation", + "BenchmarkResult", + "BenchmarkTask", + "StaticDatasetAdapter", + "UIElement", +] diff --git a/openadapt_ml/benchmarks/data_collection.py b/openadapt_ml/benchmarks/data_collection.py index 8147a94..f8e9e17 100644 --- a/openadapt_ml/benchmarks/data_collection.py +++ b/openadapt_ml/benchmarks/data_collection.py @@ -1,444 +1,26 @@ -"""Data collection for benchmark viewer integration. +"""DEPRECATED: Import from openadapt_evals instead. -This module handles saving execution traces during benchmark runs for later -replay in the benchmark viewer. It creates a structured directory layout with -screenshots, metadata, and execution traces. - -Directory structure: - benchmark_results/ - ├── waa_eval_YYYYMMDD_HHMMSS/ - │ ├── metadata.json - │ ├── tasks/ - │ │ ├── task_001/ - │ │ │ ├── task.json - │ │ │ ├── screenshots/ - │ │ │ │ ├── step_000.png - │ │ │ │ ├── step_001.png - │ │ │ │ └── ... - │ │ │ └── execution.json - │ │ └── task_002/ - │ │ └── ... - │ └── summary.json - -Example: - from openadapt_ml.benchmarks.data_collection import ExecutionTraceCollector - - collector = ExecutionTraceCollector( - benchmark_name="waa", - run_name="waa_eval_20241214", - model_id="qwen3vl-2b-epoch5" - ) - - # During evaluation - collector.start_task(task) - for step_idx, (obs, action) in enumerate(trajectory): - collector.record_step(step_idx, obs, action, reasoning="...") - collector.finish_task(result) +This module is kept for backward compatibility only. +All classes are now provided by openadapt_evals.benchmarks.data_collection. """ -from __future__ import annotations - -import json -import logging -from dataclasses import asdict, dataclass -from datetime import datetime -from pathlib import Path -from typing import Any +import warnings -from openadapt_ml.benchmarks.base import ( - BenchmarkAction, - BenchmarkObservation, - BenchmarkResult, - BenchmarkTask, +warnings.warn( + "openadapt_ml.benchmarks.data_collection is deprecated. " + "Please import from openadapt_evals instead: " + "from openadapt_evals import ExecutionTraceCollector, save_execution_trace", + DeprecationWarning, + stacklevel=2, ) -logger = logging.getLogger(__name__) - - -@dataclass -class ExecutionStep: - """Single step in execution trace. - - Attributes: - step_idx: Step index in the trajectory. - screenshot_path: Relative path to screenshot image. - action: Action taken at this step. - reasoning: Optional reasoning/thought from the agent. - timestamp: Timestamp when step was recorded. - """ - - step_idx: int - screenshot_path: str | None - action: dict[str, Any] # Serialized BenchmarkAction - reasoning: str | None = None - timestamp: float | None = None - - -class ExecutionTraceCollector: - """Collects execution traces during benchmark runs. 
- - This class handles: - - Creating the directory structure for a benchmark run - - Saving screenshots at each step - - Recording actions and reasoning - - Saving task results and metadata - - Args: - benchmark_name: Name of the benchmark (e.g., "waa", "webarena"). - run_name: Unique name for this evaluation run (e.g., "waa_eval_20241214"). - model_id: Identifier for the model being evaluated. - output_dir: Base directory for benchmark results (default: "./benchmark_results"). - """ - - def __init__( - self, - benchmark_name: str, - run_name: str | None = None, - model_id: str = "unknown", - output_dir: str | Path = "benchmark_results", - ): - self.benchmark_name = benchmark_name - self.model_id = model_id - - # Auto-generate run_name if not provided - if run_name is None: - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - run_name = f"{benchmark_name}_eval_{timestamp}" - self.run_name = run_name - - # Set up directory structure - self.output_dir = Path(output_dir) - self.run_dir = self.output_dir / run_name - self.tasks_dir = self.run_dir / "tasks" - - # Current task tracking - self._current_task: BenchmarkTask | None = None - self._current_task_dir: Path | None = None - self._current_screenshots_dir: Path | None = None - self._current_steps: list[ExecutionStep] = [] - - # Initialize run - self._initialize_run() - - def _initialize_run(self) -> None: - """Initialize the benchmark run directory and metadata.""" - self.run_dir.mkdir(parents=True, exist_ok=True) - self.tasks_dir.mkdir(exist_ok=True) - - # Save run metadata - metadata = { - "benchmark_name": self.benchmark_name, - "run_name": self.run_name, - "model_id": self.model_id, - "created_at": datetime.now().isoformat(), - } - - metadata_path = self.run_dir / "metadata.json" - with open(metadata_path, "w") as f: - json.dump(metadata, f, indent=2) - - logger.info(f"Initialized benchmark run at: {self.run_dir}") - - def start_task(self, task: BenchmarkTask) -> None: - """Start collecting data for a new task. - - Args: - task: The benchmark task being executed. - """ - if self._current_task is not None: - logger.warning( - f"Starting new task {task.task_id} without finishing {self._current_task.task_id}" - ) - - self._current_task = task - self._current_steps = [] - - # Create task directory - task_dir_name = self._sanitize_task_id(task.task_id) - self._current_task_dir = self.tasks_dir / task_dir_name - self._current_task_dir.mkdir(parents=True, exist_ok=True) - - # Create screenshots directory - self._current_screenshots_dir = self._current_task_dir / "screenshots" - self._current_screenshots_dir.mkdir(exist_ok=True) - - # Save task definition - task_data = { - "task_id": task.task_id, - "instruction": task.instruction, - "domain": task.domain, - "initial_state_ref": task.initial_state_ref, - "time_limit_steps": task.time_limit_steps, - "raw_config": task.raw_config, - "evaluation_spec": task.evaluation_spec, - } - - task_path = self._current_task_dir / "task.json" - with open(task_path, "w") as f: - json.dump(task_data, f, indent=2) - - logger.info(f"Started collecting data for task: {task.task_id}") - - def record_step( - self, - step_idx: int, - observation: BenchmarkObservation, - action: BenchmarkAction, - reasoning: str | None = None, - ) -> None: - """Record a single step in the execution trace. - - Args: - step_idx: Index of this step in the trajectory. - observation: Observation at this step. - action: Action taken at this step. - reasoning: Optional reasoning/thought from the agent. 
- """ - if self._current_task is None: - raise RuntimeError("No task started. Call start_task() first.") - - # Save screenshot if available - screenshot_path = None - if observation.screenshot is not None: - screenshot_path = self._save_screenshot(step_idx, observation.screenshot) - elif observation.screenshot_path is not None: - # Copy existing screenshot - screenshot_path = self._copy_screenshot( - step_idx, observation.screenshot_path - ) - - # Create execution step record - step = ExecutionStep( - step_idx=step_idx, - screenshot_path=screenshot_path, - action=self._serialize_action(action), - reasoning=reasoning, - timestamp=datetime.now().timestamp(), - ) - - self._current_steps.append(step) - - def finish_task(self, result: BenchmarkResult) -> None: - """Finish collecting data for the current task and save execution trace. - - Args: - result: The evaluation result for the task. - """ - if self._current_task is None: - raise RuntimeError("No task started. Call start_task() first.") - - # Save execution trace - execution_data = { - "task_id": result.task_id, - "model_id": self.model_id, - "success": result.success, - "score": result.score, - "num_steps": result.num_steps, - "total_time_seconds": result.total_time_seconds, - "error": result.error, - "reason": result.reason, - "steps": [asdict(step) for step in self._current_steps], - } - - execution_path = self._current_task_dir / "execution.json" - with open(execution_path, "w") as f: - json.dump(execution_data, f, indent=2) - - logger.info( - f"Saved execution trace for task {result.task_id}: " - f"{'SUCCESS' if result.success else 'FAIL'} ({result.num_steps} steps)" - ) - - # Clear current task - self._current_task = None - self._current_task_dir = None - self._current_screenshots_dir = None - self._current_steps = [] - - def save_summary(self, all_results: list[BenchmarkResult]) -> None: - """Save summary of all task results. - - Args: - all_results: List of all BenchmarkResult objects from the run. - """ - summary = { - "benchmark_name": self.benchmark_name, - "run_name": self.run_name, - "model_id": self.model_id, - "num_tasks": len(all_results), - "num_success": sum(1 for r in all_results if r.success), - "success_rate": sum(1 for r in all_results if r.success) / len(all_results) - if all_results - else 0.0, - "avg_score": sum(r.score for r in all_results) / len(all_results) - if all_results - else 0.0, - "avg_steps": sum(r.num_steps for r in all_results) / len(all_results) - if all_results - else 0.0, - "avg_time_seconds": sum(r.total_time_seconds for r in all_results) - / len(all_results) - if all_results - else 0.0, - "tasks": [ - { - "task_id": r.task_id, - "success": r.success, - "score": r.score, - "num_steps": r.num_steps, - "error": r.error, - } - for r in all_results - ], - } - - summary_path = self.run_dir / "summary.json" - with open(summary_path, "w") as f: - json.dump(summary, f, indent=2) - - logger.info( - f"Saved summary: {summary['num_success']}/{summary['num_tasks']} tasks succeeded " - f"({summary['success_rate']:.1%})" - ) - - def _save_screenshot(self, step_idx: int, screenshot_bytes: bytes) -> str: - """Save screenshot bytes to file. - - Args: - step_idx: Step index for naming the file. - screenshot_bytes: PNG image bytes. - - Returns: - Relative path to the saved screenshot. 
- """ - if self._current_screenshots_dir is None: - raise RuntimeError("No task started") - - filename = f"step_{step_idx:03d}.png" - screenshot_path = self._current_screenshots_dir / filename - - with open(screenshot_path, "wb") as f: - f.write(screenshot_bytes) - - # Return relative path from task directory - return f"screenshots/{filename}" - - def _copy_screenshot(self, step_idx: int, source_path: str) -> str: - """Copy screenshot from existing path. - - Args: - step_idx: Step index for naming the file. - source_path: Path to existing screenshot. - - Returns: - Relative path to the copied screenshot. - """ - if self._current_screenshots_dir is None: - raise RuntimeError("No task started") - - filename = f"step_{step_idx:03d}.png" - dest_path = self._current_screenshots_dir / filename - - # Copy file - import shutil - - shutil.copy2(source_path, dest_path) - - return f"screenshots/{filename}" - - def _serialize_action(self, action: BenchmarkAction) -> dict[str, Any]: - """Serialize BenchmarkAction to dict. - - Args: - action: Action to serialize. - - Returns: - Dictionary representation of the action. - """ - return { - "type": action.type, - "x": action.x, - "y": action.y, - "target_node_id": action.target_node_id, - "target_bbox": action.target_bbox, - "target_role": action.target_role, - "target_name": action.target_name, - "text": action.text, - "key": action.key, - "modifiers": action.modifiers, - "scroll_direction": action.scroll_direction, - "scroll_amount": action.scroll_amount, - "end_x": action.end_x, - "end_y": action.end_y, - "answer": action.answer, - "raw_action": action.raw_action, - } - - def _sanitize_task_id(self, task_id: str) -> str: - """Sanitize task ID for use as directory name. - - Args: - task_id: Original task ID. - - Returns: - Sanitized task ID safe for filesystem. - """ - # Replace unsafe characters with underscores - safe_id = "".join(c if c.isalnum() or c in "-_" else "_" for c in task_id) - return safe_id - - -def save_execution_trace( - task: BenchmarkTask, - result: BenchmarkResult, - trajectory: list[tuple[BenchmarkObservation, BenchmarkAction]], - benchmark_name: str, - model_id: str = "unknown", - output_dir: str | Path = "benchmark_results", - run_name: str | None = None, - reasoning_map: dict[int, str] | None = None, -) -> Path: - """Convenience function to save a complete execution trace. - - This is a simpler alternative to using ExecutionTraceCollector directly - when you have the complete trajectory available. - - Args: - task: The benchmark task. - result: The evaluation result. - trajectory: List of (observation, action) pairs. - benchmark_name: Name of the benchmark. - model_id: Identifier for the model. - output_dir: Base directory for results. - run_name: Optional run name (auto-generated if None). - reasoning_map: Optional map of step_idx -> reasoning text. - - Returns: - Path to the task directory. 
- - Example: - save_execution_trace( - task=task, - result=result, - trajectory=trajectory, - benchmark_name="waa", - model_id="qwen3vl-2b-epoch5", - reasoning_map={0: "I should click the button", 1: "Now type the text"} - ) - """ - collector = ExecutionTraceCollector( - benchmark_name=benchmark_name, - run_name=run_name, - model_id=model_id, - output_dir=output_dir, - ) - - collector.start_task(task) - - for step_idx, (obs, action) in enumerate(trajectory): - reasoning = reasoning_map.get(step_idx) if reasoning_map else None - collector.record_step(step_idx, obs, action, reasoning) - - collector.finish_task(result) +# Re-export from canonical location +from openadapt_evals.benchmarks.data_collection import ( + ExecutionTraceCollector, + save_execution_trace, +) - return collector._current_task_dir or collector.tasks_dir +__all__ = [ + "ExecutionTraceCollector", + "save_execution_trace", +] diff --git a/openadapt_ml/benchmarks/live_tracker.py b/openadapt_ml/benchmarks/live_tracker.py index 4126a8d..d719864 100644 --- a/openadapt_ml/benchmarks/live_tracker.py +++ b/openadapt_ml/benchmarks/live_tracker.py @@ -1,188 +1,24 @@ -"""Live evaluation progress tracker for benchmark viewer. +"""DEPRECATED: Import from openadapt_evals instead. -This module provides a tracker that writes real-time evaluation progress -to a JSON file that the viewer can poll via /api/benchmark-live. +This module is kept for backward compatibility only. +All classes are now provided by openadapt_evals.benchmarks.live_tracker. """ -from __future__ import annotations +import warnings -import json -from dataclasses import asdict, dataclass -from pathlib import Path -from typing import Any - -from openadapt_ml.benchmarks.base import ( - BenchmarkAction, - BenchmarkObservation, - BenchmarkResult, - BenchmarkTask, +warnings.warn( + "openadapt_ml.benchmarks.live_tracker is deprecated. " + "Please import from openadapt_evals instead: " + "from openadapt_evals import LiveEvaluationTracker", + DeprecationWarning, + stacklevel=2, ) +# Re-export from canonical location +from openadapt_evals.benchmarks.live_tracker import ( + LiveEvaluationTracker, +) -@dataclass -class LiveStepData: - """Data for a single step in live evaluation.""" - - step_idx: int - action: dict[str, Any] - reasoning: str | None = None - screenshot_url: str | None = None - - -@dataclass -class LiveTaskData: - """Data for current task being evaluated.""" - - task_id: str - instruction: str - domain: str - steps: list[LiveStepData] - result: dict[str, Any] | None = None - - -class LiveEvaluationTracker: - """Tracks live evaluation progress and writes to benchmark_live.json. - - This class is designed to be used alongside ExecutionTraceCollector - to provide real-time progress updates to the viewer. - - Args: - output_file: Path to output JSON file (default: benchmark_live.json). - total_tasks: Total number of tasks to evaluate. - """ - - def __init__( - self, - output_file: str | Path = "benchmark_live.json", - total_tasks: int = 0, - ): - self.output_file = Path(output_file) - self.total_tasks = total_tasks - self.tasks_completed = 0 - self.current_task: LiveTaskData | None = None - - # Initialize with idle state - self._write_state({"status": "idle"}) - - def start_task(self, task: BenchmarkTask) -> None: - """Start tracking a new task. - - Args: - task: The benchmark task being evaluated. 
- """ - self.current_task = LiveTaskData( - task_id=task.task_id, - instruction=task.instruction, - domain=task.domain or "unknown", - steps=[], - result=None, - ) - - self._write_state( - { - "status": "running", - "total_tasks": self.total_tasks, - "tasks_completed": self.tasks_completed, - "current_task": asdict(self.current_task), - } - ) - - def record_step( - self, - step_idx: int, - observation: BenchmarkObservation, - action: BenchmarkAction, - reasoning: str | None = None, - ) -> None: - """Record a step in the current task. - - Args: - step_idx: Index of this step. - observation: Observation at this step. - action: Action taken at this step. - reasoning: Optional reasoning/thought from agent. - """ - if self.current_task is None: - raise RuntimeError("No task started. Call start_task() first.") - - # Serialize action - action_data = { - "type": action.type, - "x": action.x, - "y": action.y, - "target_node_id": action.target_node_id, - "text": action.text, - "key": action.key, - } - - # Create step data - step = LiveStepData( - step_idx=step_idx, - action=action_data, - reasoning=reasoning, - screenshot_url=None, # Could be populated if we serve screenshots - ) - - self.current_task.steps.append(step) - - # Write updated state - self._write_state( - { - "status": "running", - "total_tasks": self.total_tasks, - "tasks_completed": self.tasks_completed, - "current_task": asdict(self.current_task), - } - ) - - def finish_task(self, result: BenchmarkResult) -> None: - """Finish tracking the current task. - - Args: - result: The evaluation result for the task. - """ - if self.current_task is None: - raise RuntimeError("No task started. Call start_task() first.") - - # Add result to current task - self.current_task.result = { - "success": result.success, - "score": result.score, - "num_steps": result.num_steps, - "total_time_seconds": result.total_time_seconds, - } - - # Increment completed count - self.tasks_completed += 1 - - # Write updated state - self._write_state( - { - "status": "running", - "total_tasks": self.total_tasks, - "tasks_completed": self.tasks_completed, - "current_task": asdict(self.current_task), - } - ) - - # Clear current task - self.current_task = None - - def finish(self) -> None: - """Mark evaluation as complete.""" - self._write_state( - { - "status": "complete", - "total_tasks": self.total_tasks, - "tasks_completed": self.tasks_completed, - } - ) - - def _write_state(self, state: dict[str, Any]) -> None: - """Write current state to JSON file. - - Args: - state: State dictionary to write. - """ - with open(self.output_file, "w") as f: - json.dump(state, f, indent=2) +__all__ = [ + "LiveEvaluationTracker", +] diff --git a/openadapt_ml/benchmarks/runner.py b/openadapt_ml/benchmarks/runner.py index 320af27..b55c99d 100644 --- a/openadapt_ml/benchmarks/runner.py +++ b/openadapt_ml/benchmarks/runner.py @@ -1,432 +1,32 @@ -"""Evaluation runner for benchmarks. +"""DEPRECATED: Import from openadapt_evals instead. -This module provides functions to run agents on benchmarks and collect results. - -Example: - from openadapt_ml.benchmarks import WAAAdapter, PolicyAgent, evaluate_agent_on_benchmark - - adapter = WAAAdapter(waa_repo_path="/path/to/WAA") - agent = PolicyAgent(policy) - results = evaluate_agent_on_benchmark(agent, adapter, max_steps=50) - - print(f"Success rate: {sum(r.success for r in results) / len(results):.1%}") +This module is kept for backward compatibility only. +All functions are now provided by openadapt_evals.benchmarks.runner. 
""" -from __future__ import annotations +import warnings -import logging -import time -from concurrent.futures import ThreadPoolExecutor, as_completed -from dataclasses import dataclass -from typing import Callable - -from openadapt_ml.benchmarks.agent import BenchmarkAgent -from openadapt_ml.benchmarks.base import ( - BenchmarkAdapter, - BenchmarkAction, - BenchmarkObservation, - BenchmarkResult, - BenchmarkTask, +warnings.warn( + "openadapt_ml.benchmarks.runner is deprecated. " + "Please import from openadapt_evals instead: " + "from openadapt_evals import evaluate_agent_on_benchmark, compute_metrics", + DeprecationWarning, + stacklevel=2, ) -from openadapt_ml.benchmarks.data_collection import ExecutionTraceCollector -from openadapt_ml.benchmarks.live_tracker import LiveEvaluationTracker - -logger = logging.getLogger(__name__) - - -@dataclass -class EvaluationConfig: - """Configuration for benchmark evaluation. - - Attributes: - max_steps: Maximum steps per task. - parallel: Number of parallel workers (if supported). - save_trajectories: Whether to save full trajectories in results. - verbose: Whether to print progress. - on_step: Optional callback called after each step. - on_task_complete: Optional callback called after each task. - save_execution_traces: Whether to save execution traces for viewer. - model_id: Model identifier for execution traces. - output_dir: Output directory for benchmark results. - run_name: Name for this evaluation run. - enable_live_tracking: Whether to enable live evaluation progress tracking. - live_tracking_file: Path to live tracking JSON file. - """ - - max_steps: int = 50 - parallel: int = 1 - save_trajectories: bool = True - verbose: bool = True - on_step: Callable[[BenchmarkObservation, BenchmarkAction, int], None] | None = None - on_task_complete: Callable[[BenchmarkResult], None] | None = None - save_execution_traces: bool = True - model_id: str = "unknown" - output_dir: str = "benchmark_results" - run_name: str | None = None - enable_live_tracking: bool = True - live_tracking_file: str = "benchmark_live.json" - - -def evaluate_agent_on_benchmark( - agent: BenchmarkAgent, - adapter: BenchmarkAdapter, - task_ids: list[str] | None = None, - max_steps: int = 50, - parallel: int = 1, - config: EvaluationConfig | None = None, -) -> list[BenchmarkResult]: - """Run agent on benchmark tasks and collect results. - - Args: - agent: Agent to evaluate. - adapter: Benchmark adapter. - task_ids: Specific tasks to run (None = all tasks). - max_steps: Maximum steps per task (overridden by config if provided). - parallel: Number of parallel workers (overridden by config if provided). - config: Full evaluation configuration. - - Returns: - List of BenchmarkResult for each task. 
- """ - if config is None: - config = EvaluationConfig(max_steps=max_steps, parallel=parallel) - - # Load tasks - if task_ids is not None: - tasks = [adapter.load_task(tid) for tid in task_ids] - else: - tasks = adapter.list_tasks() - - if config.verbose: - logger.info(f"Evaluating {len(tasks)} tasks on {adapter.name}") - - # Initialize execution trace collector if enabled - trace_collector = None - if config.save_execution_traces: - trace_collector = ExecutionTraceCollector( - benchmark_name=adapter.name, - run_name=config.run_name, - model_id=config.model_id, - output_dir=config.output_dir, - ) - if config.verbose: - logger.info(f"Saving execution traces to: {trace_collector.run_dir}") - - # Initialize live evaluation tracker if enabled - live_tracker = None - if config.enable_live_tracking: - live_tracker = LiveEvaluationTracker( - output_file=config.live_tracking_file, - total_tasks=len(tasks), - ) - if config.verbose: - logger.info(f"Live tracking enabled: {config.live_tracking_file}") - - # Run evaluation - if config.parallel > 1 and adapter.supports_parallel: - results = _evaluate_parallel( - agent, adapter, tasks, config, trace_collector, live_tracker - ) - else: - results = _evaluate_sequential( - agent, adapter, tasks, config, trace_collector, live_tracker - ) - - # Save summary if trace collection is enabled - if trace_collector is not None: - trace_collector.save_summary(results) - - # Mark live tracking as complete - if live_tracker is not None: - live_tracker.finish() - - # Log summary - if config.verbose: - success_count = sum(1 for r in results if r.success) - success_rate = success_count / len(results) if results else 0 - avg_steps = sum(r.num_steps for r in results) / len(results) if results else 0 - logger.info( - f"Evaluation complete: {success_count}/{len(results)} " - f"({success_rate:.1%}) success, {avg_steps:.1f} avg steps" - ) - - return results - - -def _evaluate_sequential( - agent: BenchmarkAgent, - adapter: BenchmarkAdapter, - tasks: list[BenchmarkTask], - config: EvaluationConfig, - trace_collector: ExecutionTraceCollector | None = None, - live_tracker: LiveEvaluationTracker | None = None, -) -> list[BenchmarkResult]: - """Run evaluation sequentially. - - Args: - agent: Agent to evaluate. - adapter: Benchmark adapter. - tasks: Tasks to evaluate. - config: Evaluation configuration. - trace_collector: Optional trace collector for saving execution data. - live_tracker: Optional live evaluation tracker. - - Returns: - List of results. - """ - results = [] - for i, task in enumerate(tasks): - if config.verbose: - logger.info(f"Task {i + 1}/{len(tasks)}: {task.task_id}") - - result = _run_single_task( - agent, adapter, task, config, trace_collector, live_tracker - ) - results.append(result) - - if config.on_task_complete: - config.on_task_complete(result) - - return results - - -def _evaluate_parallel( - agent: BenchmarkAgent, - adapter: BenchmarkAdapter, - tasks: list[BenchmarkTask], - config: EvaluationConfig, - trace_collector: ExecutionTraceCollector | None = None, - live_tracker: LiveEvaluationTracker | None = None, -) -> list[BenchmarkResult]: - """Run evaluation in parallel. - - Note: This requires the adapter to support parallel execution - (e.g., via multiple VM instances). - - Args: - agent: Agent to evaluate. - adapter: Benchmark adapter. - tasks: Tasks to evaluate. - config: Evaluation configuration. - trace_collector: Optional trace collector for saving execution data. - live_tracker: Optional live evaluation tracker. - - Returns: - List of results. 
- """ - results = [] - - with ThreadPoolExecutor(max_workers=config.parallel) as executor: - # Submit all tasks - future_to_task = { - executor.submit( - _run_single_task, - agent, - adapter, - task, - config, - trace_collector, - live_tracker, - ): task - for task in tasks - } - - # Collect results as they complete - for future in as_completed(future_to_task): - task = future_to_task[future] - try: - result = future.result() - results.append(result) - - if config.on_task_complete: - config.on_task_complete(result) - - if config.verbose: - status = "SUCCESS" if result.success else "FAIL" - logger.info(f"Task {task.task_id}: {status}") - except Exception as e: - logger.error(f"Task {task.task_id} failed with error: {e}") - results.append( - BenchmarkResult( - task_id=task.task_id, - success=False, - score=0.0, - error=str(e), - ) - ) - - return results - - -def _run_single_task( - agent: BenchmarkAgent, - adapter: BenchmarkAdapter, - task: BenchmarkTask, - config: EvaluationConfig, - trace_collector: ExecutionTraceCollector | None = None, - live_tracker: LiveEvaluationTracker | None = None, -) -> BenchmarkResult: - """Run a single task and return result. - - Args: - agent: Agent to evaluate. - adapter: Benchmark adapter. - task: Task to run. - config: Evaluation configuration. - trace_collector: Optional trace collector for saving execution data. - live_tracker: Optional live evaluation tracker. - - Returns: - BenchmarkResult. - """ - start_time = time.perf_counter() - history: list[tuple[BenchmarkObservation, BenchmarkAction]] = [] - - # Start trace collection if enabled - if trace_collector is not None: - trace_collector.start_task(task) - - # Start live tracking if enabled - if live_tracker is not None: - live_tracker.start_task(task) - - try: - # Reset agent and environment - agent.reset() - obs = adapter.reset(task) - - done = False - steps = 0 - max_steps = task.time_limit_steps or config.max_steps - - while not done and steps < max_steps: - # Get action from agent - action = agent.act(obs, task, history if config.save_trajectories else None) - - # Extract reasoning if available from PolicyAgent - reasoning = None - if hasattr(action, "raw_action") and action.raw_action: - reasoning = action.raw_action.get("thought") - - # Record step in trace collector - if trace_collector is not None: - trace_collector.record_step(steps, obs, action, reasoning) - - # Record step in live tracker - if live_tracker is not None: - live_tracker.record_step(steps, obs, action, reasoning) - - # Record step in history - if config.save_trajectories: - history.append((obs, action)) - - if config.on_step: - config.on_step(obs, action, steps) - - # Check for terminal action - if action.type == "done": - done = True - break - - # Execute action - obs, done, info = adapter.step(action) - steps += 1 - - # Evaluate result - result = adapter.evaluate(task) - - # Update result with trajectory info - result.steps = history if config.save_trajectories else [] - result.num_steps = steps - result.total_time_seconds = time.perf_counter() - start_time - - # Finish trace collection if enabled - if trace_collector is not None: - trace_collector.finish_task(result) - - # Finish live tracking if enabled - if live_tracker is not None: - live_tracker.finish_task(result) - - return result - - except Exception as e: - logger.error(f"Error running task {task.task_id}: {e}") - result = BenchmarkResult( - task_id=task.task_id, - success=False, - score=0.0, - steps=history if config.save_trajectories else [], - num_steps=len(history), 
- error=str(e), - total_time_seconds=time.perf_counter() - start_time, - ) - - # Finish trace collection even on error - if trace_collector is not None: - trace_collector.finish_task(result) - - return result - - -def compute_metrics(results: list[BenchmarkResult]) -> dict: - """Compute aggregate metrics from evaluation results. - - Args: - results: List of BenchmarkResult from evaluation. - - Returns: - Dict with aggregate metrics. - """ - if not results: - return { - "num_tasks": 0, - "success_rate": 0.0, - "avg_score": 0.0, - "avg_steps": 0.0, - "avg_time_seconds": 0.0, - } - - num_tasks = len(results) - success_count = sum(1 for r in results if r.success) - total_score = sum(r.score for r in results) - total_steps = sum(r.num_steps for r in results) - total_time = sum(r.total_time_seconds for r in results) - - return { - "num_tasks": num_tasks, - "success_rate": success_count / num_tasks, - "avg_score": total_score / num_tasks, - "avg_steps": total_steps / num_tasks, - "avg_time_seconds": total_time / num_tasks, - "success_count": success_count, - "fail_count": num_tasks - success_count, - } - - -def compute_domain_metrics( - results: list[BenchmarkResult], tasks: list[BenchmarkTask] -) -> dict[str, dict]: - """Compute per-domain metrics. - - Args: - results: List of BenchmarkResult. - tasks: List of BenchmarkTask (to get domain info). - - Returns: - Dict mapping domain to metrics dict. - """ - # Build task_id -> domain mapping - task_domains = {t.task_id: t.domain for t in tasks} - - # Group results by domain - domain_results: dict[str, list[BenchmarkResult]] = {} - for result in results: - domain = task_domains.get(result.task_id, "unknown") - if domain not in domain_results: - domain_results[domain] = [] - domain_results[domain].append(result) +# Re-export from canonical location +from openadapt_evals.benchmarks.runner import ( + EvaluationConfig, + evaluate_agent_on_benchmark, +) +from openadapt_evals.benchmarks import ( + compute_domain_metrics, + compute_metrics, +) - # Compute metrics per domain - return {domain: compute_metrics(res) for domain, res in domain_results.items()} +__all__ = [ + "EvaluationConfig", + "compute_domain_metrics", + "compute_metrics", + "evaluate_agent_on_benchmark", +] diff --git a/openadapt_ml/benchmarks/waa.py b/openadapt_ml/benchmarks/waa.py index 120a954..b42004f 100644 --- a/openadapt_ml/benchmarks/waa.py +++ b/openadapt_ml/benchmarks/waa.py @@ -1,831 +1,30 @@ -"""Windows Agent Arena (WAA) benchmark adapter. +"""DEPRECATED: Import from openadapt_evals instead. -This module provides integration with the Windows Agent Arena benchmark, -enabling evaluation of GUI agents on 154 Windows tasks across 11 domains. - -WAA Repository: https://github.com/microsoft/WindowsAgentArena - -Example: - from openadapt_ml.benchmarks import WAAAdapter, PolicyAgent, evaluate_agent_on_benchmark - - adapter = WAAAdapter(waa_repo_path="/path/to/WindowsAgentArena") - agent = PolicyAgent(policy) - results = evaluate_agent_on_benchmark(agent, adapter, max_steps=15) - print(f"Success rate: {sum(r.success for r in results) / len(results):.1%}") +This module is kept for backward compatibility only. +All classes are now provided by openadapt_evals.adapters.waa. 
""" -from __future__ import annotations - -import json -import logging -import sys -import time -from dataclasses import dataclass -from pathlib import Path -from typing import Any +import warnings -from openadapt_ml.benchmarks.base import ( - BenchmarkAction, - BenchmarkAdapter, - BenchmarkObservation, - BenchmarkResult, - BenchmarkTask, +warnings.warn( + "openadapt_ml.benchmarks.waa is deprecated. " + "Please import from openadapt_evals instead: " + "from openadapt_evals import WAAAdapter, WAAMockAdapter, WAAConfig", + DeprecationWarning, + stacklevel=2, ) -logger = logging.getLogger(__name__) - +# Re-export from canonical location +from openadapt_evals.adapters.waa import ( + WAA_DOMAINS, + WAAAdapter, + WAAConfig, + WAAMockAdapter, +) -# WAA domain mapping (11 domains, 154 tasks) -WAA_DOMAINS = [ - "browser", - "office", - "coding", - "media", - "notepad", - "paint", - "file_explorer", - "clock", - "settings", - "edge", - "vscode", +__all__ = [ + "WAA_DOMAINS", + "WAAAdapter", + "WAAConfig", + "WAAMockAdapter", ] - - -@dataclass -class WAAConfig: - """Configuration for WAA adapter. - - Attributes: - waa_repo_path: Path to cloned WindowsAgentArena repository. - use_azure: Whether to use Azure VMs (enables parallelism). - observation_type: Type of observation to capture. - a11y_backend: Accessibility backend ("uia" or "win32"). - screen_width: Screen width in pixels. - screen_height: Screen height in pixels. - max_steps: Default maximum steps per task. - action_delay: Delay between actions in seconds. - """ - - waa_repo_path: str - use_azure: bool = False - observation_type: str = "screenshot_a11y_tree" # "screenshot", "a11y_tree", "som" - a11y_backend: str = "uia" # "uia" or "win32" - screen_width: int = 1920 - screen_height: int = 1200 - max_steps: int = 15 - action_delay: float = 0.5 - - -class WAAAdapter(BenchmarkAdapter): - """Windows Agent Arena benchmark adapter. - - Integrates with the WAA benchmark to evaluate GUI agents on 154 Windows - desktop automation tasks spanning 11 application domains. - - The adapter wraps WAA's DesktopEnv and provides: - - Task loading from WAA's JSON task definitions - - VM/environment reset to task initial state - - Action execution via WAA's controller - - Evaluation using WAA's native evaluators - - Args: - waa_repo_path: Path to cloned WindowsAgentArena repository. - use_azure: Use Azure VMs for execution (enables parallelism). - config: Full WAAConfig (overrides other args if provided). - **kwargs: Additional config options passed to WAAConfig. - - Raises: - ValueError: If waa_repo_path doesn't exist. - ImportError: If WAA dependencies not available. 
- """ - - def __init__( - self, - waa_repo_path: str | Path | None = None, - use_azure: bool = False, - config: WAAConfig | None = None, - **kwargs, - ): - if config is not None: - self.config = config - else: - if waa_repo_path is None: - raise ValueError("waa_repo_path is required") - self.config = WAAConfig( - waa_repo_path=str(waa_repo_path), - use_azure=use_azure, - **kwargs, - ) - - self.waa_repo = Path(self.config.waa_repo_path) - if not self.waa_repo.exists(): - raise ValueError(f"WAA repository not found at: {self.waa_repo}") - - # Paths to WAA components - self._client_path = self.waa_repo / "src" / "win-arena-container" / "client" - self._tasks_path = self._client_path / "evaluation_examples_windows" - - # Lazy-loaded WAA components - self._desktop_env = None - self._task_cache: dict[str, BenchmarkTask] = {} - self._current_task: BenchmarkTask | None = None - self._waa_imported = False - - def _ensure_waa_imported(self) -> None: - """Import WAA modules (lazy loading).""" - if self._waa_imported: - return - - # Add WAA client to path - client_path = str(self._client_path) - if client_path not in sys.path: - sys.path.insert(0, client_path) - - try: - # Import WAA's DesktopEnv - from desktop_env import DesktopEnv - - self._DesktopEnv = DesktopEnv - self._waa_imported = True - logger.info("WAA modules imported successfully") - except ImportError as e: - raise ImportError( - f"Failed to import WAA modules. Ensure WAA is properly installed " - f"and dependencies are available: {e}" - ) from e - - @property - def name(self) -> str: - """Benchmark name.""" - return "waa" - - @property - def benchmark_type(self) -> str: - """Benchmark type (interactive).""" - return "interactive" - - @property - def supports_parallel(self) -> bool: - """Whether parallel execution is supported (requires Azure).""" - return self.config.use_azure - - def list_tasks(self, domain: str | None = None) -> list[BenchmarkTask]: - """List available WAA tasks. - - WAA has 154 tasks across 11 domains: - - browser: Edge/Chrome navigation and settings - - office: Word, Excel, Outlook - - coding: VSCode, terminal - - settings: Windows Settings app - - file_explorer: File operations - - notepad: Text editing - - paint: Drawing operations - - media: Video/audio playback - - clock: Alarms, timers - - edge: Browser-specific - - vscode: IDE-specific - - Args: - domain: Optional domain filter. - - Returns: - List of BenchmarkTask objects. - """ - tasks = self._load_all_tasks() - - if domain is not None: - tasks = [t for t in tasks if t.domain == domain] - - return tasks - - def load_task(self, task_id: str) -> BenchmarkTask: - """Load a specific task by ID. - - Args: - task_id: Task identifier (e.g., "notepad_1", "browser_5"). - - Returns: - BenchmarkTask object. - - Raises: - KeyError: If task_id not found. - """ - if task_id in self._task_cache: - return self._task_cache[task_id] - - # Try to load from disk - tasks = self._load_all_tasks() - task_map = {t.task_id: t for t in tasks} - - if task_id not in task_map: - raise KeyError( - f"Task '{task_id}' not found. Available: {list(task_map.keys())[:10]}..." - ) - - return task_map[task_id] - - def reset(self, task: BenchmarkTask) -> BenchmarkObservation: - """Reset environment to task's initial state. - - This initializes the Windows VM/desktop to the state required for - the task, including opening required applications and setting up - any pre-conditions. - - Args: - task: Task to initialize. - - Returns: - Initial observation (screenshot + accessibility tree). 
- """ - self._ensure_waa_imported() - self._current_task = task - - # Initialize DesktopEnv if needed - if self._desktop_env is None: - self._desktop_env = self._create_desktop_env() - - # Load task config and reset environment - task_config = self._load_waa_task_config(task) - obs = self._desktop_env.reset(task_config=task_config) - - return self._to_benchmark_observation(obs) - - def step( - self, action: BenchmarkAction - ) -> tuple[BenchmarkObservation, bool, dict[str, Any]]: - """Execute action and return new observation. - - Args: - action: Action to execute. - - Returns: - Tuple of (observation, done, info). - """ - if self._desktop_env is None: - raise RuntimeError("Call reset() before step()") - - # Convert to WAA action format - waa_action = self._to_waa_action(action) - - # Execute action - obs, reward, done, info = self._desktop_env.step(waa_action) - - # Optional delay between actions - if self.config.action_delay > 0: - time.sleep(self.config.action_delay) - - return self._to_benchmark_observation(obs), done, info - - def evaluate(self, task: BenchmarkTask) -> BenchmarkResult: - """Run WAA's native evaluation on current state. - - WAA evaluators check the actual OS state (files, settings, app state) - to determine if the task was completed successfully. - - Args: - task: Task to evaluate. - - Returns: - BenchmarkResult with success/score. - """ - if self._desktop_env is None: - raise RuntimeError("Call reset() and step() before evaluate()") - - # Run WAA's evaluator - try: - result = self._desktop_env.evaluate() - success = result.get("success", False) - score = 1.0 if success else 0.0 - reason = result.get("reason", None) - except Exception as e: - logger.error(f"Evaluation failed for task {task.task_id}: {e}") - success = False - score = 0.0 - reason = str(e) - - return BenchmarkResult( - task_id=task.task_id, - success=success, - score=score, - reason=reason, - ) - - def close(self) -> None: - """Clean up resources.""" - if self._desktop_env is not None: - try: - self._desktop_env.close() - except Exception as e: - logger.warning(f"Error closing DesktopEnv: {e}") - self._desktop_env = None - - def _create_desktop_env(self): - """Create WAA DesktopEnv instance.""" - require_a11y = self.config.observation_type in [ - "a11y_tree", - "screenshot_a11y_tree", - "som", - ] - - return self._DesktopEnv( - screen_size=(self.config.screen_width, self.config.screen_height), - require_a11y_tree=require_a11y, - a11y_backend=self.config.a11y_backend, - ) - - def _load_all_tasks(self) -> list[BenchmarkTask]: - """Load all WAA tasks from the repository.""" - if self._task_cache: - return list(self._task_cache.values()) - - tasks = [] - - # Load test_all.json metadata - meta_path = self._tasks_path / "test_all.json" - if meta_path.exists(): - with open(meta_path, encoding="utf-8") as f: - meta = json.load(f) - - for domain, task_ids in meta.items(): - if domain in WAA_DOMAINS: - for task_id in task_ids: - task = self._load_task_from_json(domain, task_id) - if task: - tasks.append(task) - self._task_cache[task.task_id] = task - else: - # Fallback: scan examples directory - examples_dir = self._tasks_path / "examples" - if examples_dir.exists(): - for domain_dir in examples_dir.iterdir(): - if domain_dir.is_dir() and domain_dir.name in WAA_DOMAINS: - for task_file in domain_dir.glob("*.json"): - task = self._load_task_from_file(task_file, domain_dir.name) - if task: - tasks.append(task) - self._task_cache[task.task_id] = task - - logger.info(f"Loaded {len(tasks)} WAA tasks") - return tasks - 
- def _load_task_from_json(self, domain: str, task_id: str) -> BenchmarkTask | None: - """Load a task from its JSON file.""" - task_file = self._tasks_path / "examples" / domain / f"{task_id}.json" - if not task_file.exists(): - logger.warning(f"Task file not found: {task_file}") - return None - - return self._load_task_from_file(task_file, domain) - - def _load_task_from_file( - self, task_file: Path, domain: str - ) -> BenchmarkTask | None: - """Load a task from a JSON file.""" - try: - with open(task_file, encoding="utf-8") as f: - config = json.load(f) - - task_id = f"{domain}_{task_file.stem}" - instruction = config.get("instruction", config.get("task", "")) - - return BenchmarkTask( - task_id=task_id, - instruction=instruction, - domain=domain, - initial_state_ref=config.get("snapshot", None), - time_limit_steps=config.get("max_steps", self.config.max_steps), - raw_config=config, - evaluation_spec=config.get("evaluation", None), - ) - except Exception as e: - logger.warning(f"Failed to load task from {task_file}: {e}") - return None - - def _load_waa_task_config(self, task: BenchmarkTask) -> dict: - """Convert BenchmarkTask to WAA's task config format.""" - return task.raw_config - - def _to_benchmark_observation(self, waa_obs: dict | Any) -> BenchmarkObservation: - """Convert WAA observation to canonical format. - - WAA observations may include: - - screenshot: PIL Image or bytes - - a11y_tree: UIA accessibility tree dict - - window_title: Active window title - """ - # Handle different WAA observation formats - if isinstance(waa_obs, dict): - screenshot = waa_obs.get("screenshot") - a11y_tree = waa_obs.get("a11y_tree", waa_obs.get("accessibility_tree")) - window_title = waa_obs.get("window_title") - raw_obs = waa_obs - else: - # WAA may return observation as object with attributes - screenshot = getattr(waa_obs, "screenshot", None) - a11y_tree = getattr(waa_obs, "a11y_tree", None) - window_title = getattr(waa_obs, "window_title", None) - raw_obs = {"waa_obs_type": type(waa_obs).__name__} - - # Convert PIL Image to bytes if needed - screenshot_bytes = None - if screenshot is not None: - if hasattr(screenshot, "tobytes"): - # PIL Image - convert to PNG bytes - import io - - buf = io.BytesIO() - screenshot.save(buf, format="PNG") - screenshot_bytes = buf.getvalue() - elif isinstance(screenshot, bytes): - screenshot_bytes = screenshot - - return BenchmarkObservation( - screenshot=screenshot_bytes, - viewport=(self.config.screen_width, self.config.screen_height), - accessibility_tree=a11y_tree, - window_title=window_title, - raw_observation=raw_obs, - ) - - def _to_waa_action(self, action: BenchmarkAction) -> dict: - """Convert canonical action to WAA format. 
- - WAA action format: - - click: {"action_type": "click", "coordinate": [x, y]} - - double_click: {"action_type": "double_click", "coordinate": [x, y]} - - type: {"action_type": "type", "text": "..."} - - key: {"action_type": "key", "key": "...", "modifiers": [...]} - - scroll: {"action_type": "scroll", "direction": "...", "amount": ...} - - drag: {"action_type": "drag", "start": [x, y], "end": [x, y]} - """ - action_type = action.type - - # Map canonical action types to WAA format - if action_type == "click": - x = action.x or 0 - y = action.y or 0 - # Convert normalized coords to pixels if needed - if 0 <= x <= 1 and 0 <= y <= 1: - x = int(x * self.config.screen_width) - y = int(y * self.config.screen_height) - return { - "action_type": "click", - "coordinate": [int(x), int(y)], - } - - elif action_type == "double_click": - x = action.x or 0 - y = action.y or 0 - if 0 <= x <= 1 and 0 <= y <= 1: - x = int(x * self.config.screen_width) - y = int(y * self.config.screen_height) - return { - "action_type": "double_click", - "coordinate": [int(x), int(y)], - } - - elif action_type == "right_click": - x = action.x or 0 - y = action.y or 0 - if 0 <= x <= 1 and 0 <= y <= 1: - x = int(x * self.config.screen_width) - y = int(y * self.config.screen_height) - return { - "action_type": "right_click", - "coordinate": [int(x), int(y)], - } - - elif action_type == "type": - return { - "action_type": "type", - "text": action.text or "", - } - - elif action_type == "key": - waa_action = { - "action_type": "key", - "key": action.key or "", - } - if action.modifiers: - waa_action["modifiers"] = action.modifiers - return waa_action - - elif action_type == "scroll": - return { - "action_type": "scroll", - "direction": action.scroll_direction or "down", - "amount": action.scroll_amount or 3, # Default scroll amount - } - - elif action_type == "drag": - x1 = action.x or 0 - y1 = action.y or 0 - x2 = action.end_x or 0 - y2 = action.end_y or 0 - # Convert normalized coords - if 0 <= x1 <= 1: - x1 = int(x1 * self.config.screen_width) - y1 = int(y1 * self.config.screen_height) - if 0 <= x2 <= 1: - x2 = int(x2 * self.config.screen_width) - y2 = int(y2 * self.config.screen_height) - return { - "action_type": "drag", - "start": [int(x1), int(y1)], - "end": [int(x2), int(y2)], - } - - elif action_type == "done": - return {"action_type": "done"} - - elif action_type == "wait": - return {"action_type": "wait"} - - else: - logger.warning(f"Unknown action type: {action_type}") - return {"action_type": action_type, "raw": action.raw_action} - - -class WAAMockAdapter(BenchmarkAdapter): - """Mock WAA adapter for testing without Windows VM. - - Useful for: - - Testing the benchmark integration without actual WAA - - Development on non-Windows platforms - - Unit tests - - Args: - num_tasks: Number of mock tasks to generate. - domains: Domains to include in mock tasks. 
- """ - - def __init__( - self, - num_tasks: int = 20, - domains: list[str] | None = None, - ): - self._num_tasks = num_tasks - self._domains = domains or WAA_DOMAINS[:3] # Default to first 3 domains - self._tasks: list[BenchmarkTask] = [] - self._current_task: BenchmarkTask | None = None - self._step_count = 0 - self._temp_dir: Path | None = None - self._actions: list[BenchmarkAction] = [] # Track actions for evaluation - self._text_entered: str | None = None # Track typed text - self._generate_mock_tasks() - - @property - def name(self) -> str: - return "waa-mock" - - @property - def benchmark_type(self) -> str: - return "interactive" - - def _generate_mock_tasks(self) -> None: - """Generate mock tasks for testing.""" - tasks_per_domain = self._num_tasks // len(self._domains) - extra = self._num_tasks % len(self._domains) - - for i, domain in enumerate(self._domains): - count = tasks_per_domain + (1 if i < extra else 0) - for j in range(count): - task_id = f"{domain}_{j + 1}" - self._tasks.append( - BenchmarkTask( - task_id=task_id, - instruction=f"Mock task {j + 1} in {domain} domain", - domain=domain, - time_limit_steps=15, - raw_config={"mock": True}, - ) - ) - - def list_tasks(self, domain: str | None = None) -> list[BenchmarkTask]: - if domain is not None: - return [t for t in self._tasks if t.domain == domain] - return self._tasks - - def load_task(self, task_id: str) -> BenchmarkTask: - for task in self._tasks: - if task.task_id == task_id: - return task - raise KeyError(f"Task '{task_id}' not found") - - def reset(self, task: BenchmarkTask) -> BenchmarkObservation: - self._current_task = task - self._step_count = 0 - self._actions = [] # Clear action history - self._text_entered = None - return self._mock_observation() - - def step( - self, action: BenchmarkAction - ) -> tuple[BenchmarkObservation, bool, dict[str, Any]]: - self._step_count += 1 - self._actions.append(action) # Track action for evaluation - - # Track typed text - if action.type == "type" and action.text: - self._text_entered = action.text - - done = action.type == "done" or self._step_count >= 15 - return self._mock_observation(), done, {"step": self._step_count} - - def evaluate(self, task: BenchmarkTask) -> BenchmarkResult: - """Evaluate task based on actions taken. - - Success criteria for mock tasks: - - Agent clicked the Submit button (ID 4) OR - - Agent typed text AND clicked OK (ID 1) OR - - Agent completed with DONE action after meaningful interaction - - This provides deterministic evaluation based on actual agent behavior, - not random chance. The mock UI has: - - ID 1: OK button - - ID 2: Text input field - - ID 3: Cancel button - - ID 4: Submit button - """ - # Check what actions were taken - clicked_ids = set() - typed_text = False - called_done = False - - for action in self._actions: - if action.type == "click": - # Extract target node ID from action - target_id = getattr(action, "target_node_id", None) - if target_id: - clicked_ids.add(str(target_id)) - elif action.type == "type" and action.text: - typed_text = True - elif action.type == "done": - called_done = True - - # Success criteria: - # 1. Clicked Submit (ID 4) - primary success path - # 2. Typed something AND clicked OK (ID 1) - form submission path - # 3. 
Called DONE after at least 2 actions - reasonable completion - clicked_submit = "4" in clicked_ids - clicked_ok = "1" in clicked_ids - form_submitted = typed_text and clicked_ok - reasonable_completion = called_done and len(self._actions) >= 2 - - success = clicked_submit or form_submitted or reasonable_completion - - # Calculate partial credit score - score = 0.0 - if success: - score = 1.0 - elif typed_text or clicked_ids: - # Partial credit for taking meaningful actions - score = ( - 0.3 + (0.1 * min(len(clicked_ids), 3)) + (0.2 if typed_text else 0.0) - ) - - return BenchmarkResult( - task_id=task.task_id, - success=success, - score=score, - num_steps=self._step_count, - reason=f"clicked={list(clicked_ids)}, typed={typed_text}, done={called_done}", - ) - - def _mock_observation(self) -> BenchmarkObservation: - """Generate a mock observation with a real screenshot file.""" - import tempfile - - # Create temp directory if needed - if self._temp_dir is None: - self._temp_dir = Path(tempfile.mkdtemp(prefix="waa_mock_")) - - # Generate a simple mock screenshot (gray image with text) - screenshot_path = self._temp_dir / f"mock_step_{self._step_count}.png" - self._generate_mock_screenshot(screenshot_path) - - return BenchmarkObservation( - screenshot=screenshot_path.read_bytes(), - screenshot_path=str(screenshot_path), - viewport=(1920, 1200), - accessibility_tree={ - "role": "window", - "name": "Mock Window", - "children": [ - {"role": "button", "name": "OK", "id": "1"}, - {"role": "textfield", "name": "Input", "id": "2"}, - {"role": "button", "name": "Cancel", "id": "3"}, - {"role": "button", "name": "Submit", "id": "4"}, - ], - }, - window_title="Mock Window - Testing", - ) - - def _generate_mock_screenshot(self, path: Path) -> None: - """Generate a simple mock screenshot image.""" - try: - from PIL import Image, ImageDraw - - # Create a simple gray image with some UI elements - img = Image.new("RGB", (1920, 1200), color=(240, 240, 240)) - draw = ImageDraw.Draw(img) - - # Draw a title bar - draw.rectangle([0, 0, 1920, 40], fill=(60, 60, 60)) - draw.text((20, 10), "Mock Application Window", fill=(255, 255, 255)) - - # Draw some buttons - draw.rectangle([100, 100, 200, 140], fill=(0, 120, 215)) - draw.text((120, 110), "OK", fill=(255, 255, 255)) - - draw.rectangle([220, 100, 320, 140], fill=(200, 200, 200)) - draw.text((240, 110), "Cancel", fill=(0, 0, 0)) - - # Draw a text field - draw.rectangle([100, 160, 500, 200], outline=(100, 100, 100)) - draw.text((110, 170), "Enter text here...", fill=(150, 150, 150)) - - # Draw task instruction - task_name = self._current_task.task_id if self._current_task else "Unknown" - draw.text((100, 250), f"Task: {task_name}", fill=(0, 0, 0)) - draw.text((100, 280), f"Step: {self._step_count}", fill=(0, 0, 0)) - - img.save(path) - except ImportError: - # Fallback: create a minimal valid PNG if PIL not available - # This is a 1x1 gray PNG - minimal_png = bytes( - [ - 0x89, - 0x50, - 0x4E, - 0x47, - 0x0D, - 0x0A, - 0x1A, - 0x0A, # PNG signature - 0x00, - 0x00, - 0x00, - 0x0D, - 0x49, - 0x48, - 0x44, - 0x52, # IHDR chunk - 0x00, - 0x00, - 0x00, - 0x01, - 0x00, - 0x00, - 0x00, - 0x01, - 0x08, - 0x02, - 0x00, - 0x00, - 0x00, - 0x90, - 0x77, - 0x53, - 0xDE, - 0x00, - 0x00, - 0x00, - 0x0C, - 0x49, - 0x44, - 0x41, # IDAT chunk - 0x54, - 0x08, - 0xD7, - 0x63, - 0xF8, - 0xCF, - 0xC0, - 0x00, - 0x00, - 0x00, - 0x03, - 0x00, - 0x01, - 0x00, - 0x05, - 0xFE, - 0xD4, - 0xEF, - 0x00, - 0x00, - 0x00, - 0x00, - 0x49, - 0x45, # IEND chunk - 0x4E, - 0x44, - 0xAE, - 0x42, - 0x60, 
- 0x82, - ] - ) - path.write_bytes(minimal_png) diff --git a/openadapt_ml/benchmarks/waa_live.py b/openadapt_ml/benchmarks/waa_live.py index 4dfd0d4..70c7a3d 100644 --- a/openadapt_ml/benchmarks/waa_live.py +++ b/openadapt_ml/benchmarks/waa_live.py @@ -1,623 +1,26 @@ -"""Windows Agent Arena Live adapter. +"""DEPRECATED: Import from openadapt_evals instead. -This module provides a live HTTP-based adapter for WAA that connects to the -WAA Flask server running inside a Windows VM. Unlike WAAAdapter which imports -WAA's DesktopEnv locally, this adapter talks to the server remotely. - -Architecture: - The adapter uses WAA's element-based execution model: - 1. Fetch accessibility tree from /accessibility endpoint - 2. Extract element bboxes and POST to /update_computer as rects dict - 3. Agent outputs actions with target_node_id (element-based grounding) - 4. Execute via /execute_windows using computer.mouse.move_id(id) commands - - This keeps grounding authority on WAA side - we send element IDs, - not pixel coordinates. WAA's Computer class handles the grounding. - -Example: - from openadapt_ml.benchmarks.waa_live import WAALiveAdapter, WAALiveConfig - - adapter = WAALiveAdapter(WAALiveConfig(server_url="http://vm-ip:5000")) - agent = DemoConditionedAgent(base_agent, retriever) - results = evaluate_agent_on_benchmark(agent, adapter, max_steps=15) +This module is kept for backward compatibility only. +All classes are now provided by openadapt_evals.adapters.waa_live. """ -from __future__ import annotations +import warnings -import base64 -import logging -import time -from dataclasses import dataclass -from typing import Any - -import requests - -from openadapt_ml.benchmarks.base import ( - BenchmarkAction, - BenchmarkAdapter, - BenchmarkObservation, - BenchmarkResult, - BenchmarkTask, +warnings.warn( + "openadapt_ml.benchmarks.waa_live is deprecated. " + "Please import from openadapt_evals instead: " + "from openadapt_evals import WAALiveAdapter, WAALiveConfig", + DeprecationWarning, + stacklevel=2, ) -logger = logging.getLogger(__name__) - - -@dataclass -class WAALiveConfig: - """Configuration for WAALiveAdapter. - - Attributes: - server_url: URL of WAA Flask server (e.g., "http://172.171.112.41:5000"). - a11y_backend: Accessibility backend ("uia" or "win32"). - screen_width: Screen width in pixels. - screen_height: Screen height in pixels. - max_steps: Default maximum steps per task. - action_delay: Delay after actions in seconds (for UI to settle). - timeout: Request timeout in seconds. - """ - - server_url: str = "http://localhost:5000" - a11y_backend: str = "uia" - screen_width: int = 1920 - screen_height: int = 1200 - max_steps: int = 15 - action_delay: float = 0.5 - timeout: float = 90.0 - - -class WAALiveAdapter(BenchmarkAdapter): - """Live WAA adapter that connects to WAA Flask server over HTTP. - - Unlike WAAAdapter which imports WAA's DesktopEnv locally, this adapter - talks to the WAA server remotely via HTTP. This enables: - - Running DemoConditionedAgent from local machine - - Using our own VLM (Claude/GPT) instead of WAA's built-in navi agent - - Injecting demos into prompts before each action - - Args: - config: WAALiveConfig with server URL and settings. 
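The element-grounded execution model described in the removed module docstring above boils down to a three-call HTTP round-trip; a rough sketch (endpoint names and payload fields taken from the implementation being moved, the server address and rects are placeholders):

```python
import requests

SERVER = "http://vm-ip:5000"  # placeholder WAA server address

# 1. Fetch the accessibility tree; element_id -> bbox rects are derived from it.
tree = requests.get(
    f"{SERVER}/accessibility", params={"backend": "uia"}, timeout=30
).json().get("AT", {})

# 2. Sync WAA's Computer with the current rects so element IDs can be grounded server-side.
requests.post(
    f"{SERVER}/update_computer",
    json={
        "rects": {"5": [100, 200, 300, 400]},  # placeholder element bbox
        "window_rect": [0, 0, 1920, 1200],
        "screenshot": "",
        "scale": [1.0, 1.0],
    },
    timeout=30,
)

# 3. Execute an element-grounded click; WAA resolves the ID to pixel coordinates.
requests.post(
    f"{SERVER}/execute_windows",
    json={"command": "computer.mouse.move_id('5'); computer.mouse.single_click()"},
    timeout=30,
)
```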
- """ - - def __init__(self, config: WAALiveConfig | None = None): - self.config = config or WAALiveConfig() - self._current_task: BenchmarkTask | None = None - self._step_count = 0 - self._current_a11y: dict | None = None - self._current_rects: dict[str, list[int]] = {} # element_id -> [l, t, r, b] - self._current_screenshot: bytes | None = None - self._actions: list[BenchmarkAction] = [] - - @property - def name(self) -> str: - """Benchmark name.""" - return "waa-live" - - @property - def benchmark_type(self) -> str: - """Benchmark type (interactive).""" - return "interactive" - - @property - def supports_parallel(self) -> bool: - """Whether parallel execution is supported.""" - return False # Single VM for now - - def check_connection(self) -> bool: - """Check if WAA server is reachable. - - Returns: - True if server responds to /probe endpoint. - """ - try: - resp = requests.get(f"{self.config.server_url}/probe", timeout=5.0) - return resp.status_code == 200 - except requests.RequestException: - return False - - def list_tasks(self, domain: str | None = None) -> list[BenchmarkTask]: - """List available WAA tasks. - - For live adapter, tasks are typically loaded on-demand. - Returns empty list - use load_task() directly. - """ - return [] - - def load_task(self, task_id: str) -> BenchmarkTask: - """Load a specific task by ID. - - Args: - task_id: Task identifier. - - Returns: - BenchmarkTask object. - """ - # For now, create a minimal task - actual task configs should be - # loaded from WAA repo if needed - return BenchmarkTask( - task_id=task_id, - instruction=f"Task {task_id}", - domain=task_id.split("_")[0] if "_" in task_id else "unknown", - time_limit_steps=self.config.max_steps, - ) - - def reset(self, task: BenchmarkTask) -> BenchmarkObservation: - """Reset environment to task's initial state. - - Args: - task: Task to initialize. - - Returns: - Initial observation (screenshot + accessibility tree). - - Raises: - RuntimeError: If server is not reachable. - """ - if not self.check_connection(): - raise RuntimeError( - f"Cannot connect to WAA server at {self.config.server_url}. " - f"Ensure Windows VM is running and server is started." - ) - - self._current_task = task - self._step_count = 0 - self._actions = [] - - # Try to close all windows for clean state - try: - requests.post(f"{self.config.server_url}/setup/close_all", timeout=30.0) - logger.info("Closed all windows for clean state") - except requests.RequestException as e: - logger.warning(f"Failed to close windows: {e}") - - # If task has setup commands in raw_config, execute them - if task.raw_config: - self._run_task_setup(task.raw_config) - - # Small delay for UI to settle - time.sleep(1.0) - - return self._get_observation() - - def step( - self, action: BenchmarkAction - ) -> tuple[BenchmarkObservation, bool, dict[str, Any]]: - """Execute action and return new observation. - - Uses element-based grounding via WAA's Computer class. Click actions - are translated to computer.mouse.move_id(id) commands that WAA executes - using the rects we POSTed to /update_computer. - - Args: - action: Action to execute. - - Returns: - Tuple of (observation, done, info). 
- """ - self._step_count += 1 - self._actions.append(action) - - # Translate action to element-based command for WAA's Computer - command = self._translate_action(action) - - # Execute command via /execute_windows (has access to computer object) - if command: - try: - resp = requests.post( - f"{self.config.server_url}/execute_windows", - json={"command": command}, - timeout=self.config.timeout, - ) - if resp.status_code != 200: - logger.error(f"Execute failed ({resp.status_code}): {resp.text}") - else: - result = resp.json() - if result.get("stderr"): - logger.warning(f"Command stderr: {result['stderr']}") - logger.debug(f"Executed: {command}") - except requests.RequestException as e: - logger.error(f"Execute request failed: {e}") - - # Wait for UI to settle - time.sleep(self.config.action_delay) - - # Check if done - done = action.type == "done" or self._step_count >= self.config.max_steps - - obs = self._get_observation() - info = { - "step": self._step_count, - "command": command, - } - - return obs, done, info - - def evaluate(self, task: BenchmarkTask) -> BenchmarkResult: - """Evaluate current state against task success criteria. - - For live adapter, full evaluation requires running WAA's evaluators. - Currently returns a placeholder result. - - Args: - task: Task to evaluate. - - Returns: - BenchmarkResult with success/score. - """ - # TODO: Implement proper evaluation by calling WAA evaluators - # For now, check if agent took any actions - has_actions = len(self._actions) > 0 - called_done = any(a.type == "done" for a in self._actions) - - return BenchmarkResult( - task_id=task.task_id, - success=False, # Can't determine without evaluator - score=0.5 if has_actions and called_done else 0.0, - num_steps=self._step_count, - reason="Evaluation requires WAA evaluators (not yet implemented)", - ) - - def close(self) -> None: - """Clean up resources.""" - self._current_task = None - self._current_a11y = None - self._actions = [] - - def _get_observation(self) -> BenchmarkObservation: - """Fetch current observation from WAA server. - - Also extracts element rects from a11y tree and updates WAA's Computer - so element-based grounding works for subsequent actions. - - Returns: - BenchmarkObservation with screenshot and accessibility tree. 
- """ - screenshot = None - a11y_tree = None - - # Get screenshot - try: - resp = requests.get(f"{self.config.server_url}/screenshot", timeout=30.0) - if resp.status_code == 200: - screenshot = resp.content - self._current_screenshot = screenshot - logger.debug(f"Got screenshot: {len(screenshot)} bytes") - else: - logger.warning(f"Screenshot request failed: {resp.status_code}") - except requests.RequestException as e: - logger.error(f"Screenshot request error: {e}") - - # Get accessibility tree - try: - resp = requests.get( - f"{self.config.server_url}/accessibility", - params={"backend": self.config.a11y_backend}, - timeout=30.0, - ) - if resp.status_code == 200: - result = resp.json() - a11y_tree = result.get("AT", {}) - self._current_a11y = a11y_tree - # Extract rects for element-based grounding - self._current_rects = self._extract_rects_from_a11y(a11y_tree) - logger.debug( - "Got accessibility tree with %d elements", len(self._current_rects) - ) - else: - logger.warning(f"A11y request failed: {resp.status_code}") - except requests.RequestException as e: - logger.error(f"A11y request error: {e}") - - # Update WAA's Computer with current rects for element grounding - if self._current_rects: - self._update_waa_computer() - - return BenchmarkObservation( - screenshot=screenshot, - viewport=(self.config.screen_width, self.config.screen_height), - accessibility_tree=a11y_tree, - window_title=self._extract_window_title(a11y_tree), - ) - - def _extract_window_title(self, a11y_tree: dict | str | None) -> str | None: - """Extract window title from accessibility tree.""" - if not a11y_tree: - return None - # Handle XML string - can't extract title easily - if isinstance(a11y_tree, str): - return None - # Try common field names - for key in ["Name", "name", "title", "Title"]: - if key in a11y_tree: - return a11y_tree[key] - return None - - def _extract_rects_from_a11y(self, a11y_tree: dict | None) -> dict[str, list[int]]: - """Extract element ID -> bounding box mapping from accessibility tree. - - This produces the `rects` dict that WAA's Computer class expects. - The rects are then POSTed to /update_computer so WAA can handle grounding. - - Args: - a11y_tree: Accessibility tree from /accessibility endpoint. - - Returns: - Dict mapping element IDs to [left, top, right, bottom] bounding boxes. 
- """ - rects: dict[str, list[int]] = {} - - def visit(node: dict) -> None: - # Get element ID - elem_id = None - for id_field in ["id", "Id", "ID", "AutomationId"]: - if id_field in node and node[id_field]: - elem_id = str(node[id_field]) - break - - # Get bounding box - bbox = None - for bbox_field in ["bbox", "BoundingRectangle", "Rect", "rect"]: - if bbox_field in node: - bbox = node[bbox_field] - break - - # Store if we have both ID and bbox - if elem_id is not None and bbox is not None: - # Normalize bbox to [left, top, right, bottom] - if isinstance(bbox, list) and len(bbox) == 4: - # Could be [l, t, r, b] or [l, t, w, h] - assume [l, t, r, b] - rects[elem_id] = [int(x) for x in bbox] - elif isinstance(bbox, dict): - x = bbox.get("x", 0) - y = bbox.get("y", 0) - w = bbox.get("width", 0) - h = bbox.get("height", 0) - rects[elem_id] = [x, y, x + w, y + h] - elif isinstance(bbox, str): - parts = [int(p) for p in bbox.split(",")] - if len(parts) == 4: - rects[elem_id] = parts - - # Visit children - for child_field in ["children", "Children"]: - children = node.get(child_field, []) - if isinstance(children, list): - for child in children: - if isinstance(child, dict): - visit(child) - - if a11y_tree: - # Handle case where a11y_tree is XML string (WAA returns XML) - if isinstance(a11y_tree, str): - # TODO: Parse XML to dict if needed for element grounding - logger.debug("A11y tree is XML string, skipping rect extraction") - return rects - visit(a11y_tree) - - logger.debug(f"Extracted {len(rects)} element rects from a11y tree") - return rects - - def _update_waa_computer(self) -> None: - """POST current rects and screenshot to WAA's /update_computer endpoint. - - This syncs WAA's Computer object with our current element state, - allowing computer.mouse.move_id(id) to work correctly. - """ - if not self._current_rects: - logger.warning("No rects to update - skipping /update_computer") - return - - # Encode screenshot as base64 - screenshot_b64 = "" - if self._current_screenshot: - screenshot_b64 = base64.b64encode(self._current_screenshot).decode("utf-8") - - # Window rect (full screen for now) - window_rect = [0, 0, self.config.screen_width, self.config.screen_height] - - payload = { - "rects": self._current_rects, - "window_rect": window_rect, - "screenshot": screenshot_b64, - "scale": [1.0, 1.0], - } - - try: - resp = requests.post( - f"{self.config.server_url}/update_computer", json=payload, timeout=30.0 - ) - if resp.status_code == 200: - logger.debug( - "Updated WAA computer with %d rects", len(self._current_rects) - ) - else: - logger.warning( - f"update_computer failed: {resp.status_code} - {resp.text}" - ) - except requests.RequestException as e: - logger.error(f"update_computer request error: {e}") - - def _run_task_setup(self, raw_config: dict) -> None: - """Run task setup commands from raw_config. - - Args: - raw_config: Task configuration with setup commands. 
- """ - # Handle different setup command formats - setup = raw_config.get("setup", raw_config.get("init", {})) - - if isinstance(setup, dict): - # Launch application if specified - if "app" in setup or "application" in setup: - app = setup.get("app") or setup.get("application") - try: - requests.post( - f"{self.config.server_url}/setup/launch", - json={"app": app}, - timeout=30.0, - ) - logger.info(f"Launched app: {app}") - except requests.RequestException as e: - logger.warning(f"Failed to launch app: {e}") - - # Run shell commands if specified - if "commands" in setup: - for cmd in setup["commands"]: - try: - requests.post( - f"{self.config.server_url}/execute_windows", - json={"command": cmd, "shell": "powershell"}, - timeout=60.0, - ) - logger.info(f"Ran setup command: {cmd[:50]}...") - except requests.RequestException as e: - logger.warning(f"Setup command failed: {e}") - - def _translate_action(self, action: BenchmarkAction) -> str | None: - """Translate BenchmarkAction to element-based command for WAA's Computer. - - Uses WAA's Computer class via /execute_windows endpoint. Click actions - use computer.mouse.move_id(id) for element-based grounding - the actual - coordinates are resolved by WAA's Computer class using the rects we - POSTed to /update_computer. - - Args: - action: The action to translate. - - Returns: - Python command string to execute via /execute_windows endpoint, - or None for actions that don't need execution. - """ - if action.type == "done": - return None - - if action.type == "wait": - return "import time; time.sleep(1)" - - if action.type == "click": - return self._translate_click_action(action, "single_click") - - if action.type == "double_click": - return self._translate_click_action(action, "double_click") - - if action.type == "right_click": - return self._translate_click_action(action, "right_click") - - if action.type == "type": - text = action.text or "" - # Escape special characters - text = text.replace("\\", "\\\\").replace("'", "\\'") - # Use pyautogui for typing (no grounding needed) - return f"import pyautogui; pyautogui.write('{text}', interval=0.02)" - - if action.type == "key": - return self._translate_key_action(action) - - if action.type == "scroll": - direction = action.scroll_direction or "down" - return f"computer.mouse.scroll('{direction}')" - - if action.type == "drag": - # Drag requires start and end - use element IDs or coordinates - if action.target_node_id is not None: - elem_id = str(action.target_node_id) - if elem_id in self._current_rects: - # Start at element, drag to end coords - end_x = action.end_x or 0 - end_y = action.end_y or 0 - if isinstance(end_x, float) and 0 <= end_x <= 1: - end_x = int(end_x * self.config.screen_width) - if isinstance(end_y, float) and 0 <= end_y <= 1: - end_y = int(end_y * self.config.screen_height) - return ( - f"computer.mouse.move_id('{elem_id}'); " - f"computer.mouse.drag({int(end_x)}, {int(end_y)})" - ) - logger.warning("Drag requires target_node_id with valid element") - return None - - logger.warning(f"Unknown action type: {action.type}") - return None - - def _translate_click_action( - self, action: BenchmarkAction, click_method: str - ) -> str: - """Translate click-type action to element-based command. - - Args: - action: The click action. - click_method: "single_click", "double_click", or "right_click". - - Returns: - Python command string using computer.mouse.move_id() for grounding. 
- """ - # Prefer element ID for grounding (SoM mode) - if action.target_node_id is not None: - elem_id = str(action.target_node_id) - if elem_id in self._current_rects: - return f"computer.mouse.move_id('{elem_id}'); computer.mouse.{click_method}()" - else: - logger.warning( - f"Element ID '{elem_id}' not found in rects, falling back to coordinates" - ) - - # Fallback: use coordinates if provided (less precise) - x = action.x if action.x is not None else 0 - y = action.y if action.y is not None else 0 - - # Normalize coordinates - if isinstance(x, float) and 0 <= x <= 1: - x = x # Keep normalized - move_abs handles it - if isinstance(y, float) and 0 <= y <= 1: - y = y # Keep normalized - - return f"computer.mouse.move_abs({x}, {y}); computer.mouse.{click_method}()" - - def _translate_key_action(self, action: BenchmarkAction) -> str: - """Translate key press action using pyautogui (no grounding needed).""" - key = action.key or "" - - # Map common key names to pyautogui names - key_map = { - "Enter": "enter", - "Return": "enter", - "Tab": "tab", - "Escape": "escape", - "Esc": "escape", - "Backspace": "backspace", - "Delete": "delete", - "Del": "delete", - "Space": "space", - "Up": "up", - "Down": "down", - "Left": "left", - "Right": "right", - "Home": "home", - "End": "end", - "PageUp": "pageup", - "PageDown": "pagedown", - "F1": "f1", - "F2": "f2", - "F3": "f3", - "F4": "f4", - "F5": "f5", - "F6": "f6", - "F7": "f7", - "F8": "f8", - "F9": "f9", - "F10": "f10", - "F11": "f11", - "F12": "f12", - } - key = key_map.get(key, key.lower()) - - # Handle modifiers with hotkey - if action.modifiers: - mods = [m.lower() for m in action.modifiers] - mod_map = {"control": "ctrl", "command": "win", "meta": "win"} - mods = [mod_map.get(m, m) for m in mods] - all_keys = mods + [key] - keys_str = ", ".join(f"'{k}'" for k in all_keys) - return f"import pyautogui; pyautogui.hotkey({keys_str})" +# Re-export from canonical location +from openadapt_evals.adapters.waa import ( + WAALiveAdapter, + WAALiveConfig, +) - return f"import pyautogui; pyautogui.press('{key}')" +__all__ = [ + "WAALiveAdapter", + "WAALiveConfig", +] diff --git a/tests/benchmarks/test_waa.py b/tests/benchmarks/test_waa.py index cc1418c..3a21248 100644 --- a/tests/benchmarks/test_waa.py +++ b/tests/benchmarks/test_waa.py @@ -35,8 +35,9 @@ def test_list_tasks_by_domain(self): def test_load_task(self): """Test loading a specific task.""" adapter = WAAMockAdapter(num_tasks=5, domains=["browser"]) - task = adapter.load_task("browser_1") - assert task.task_id == "browser_1" + # Mock adapter uses "mock_{domain}_{number:03d}" format + task = adapter.load_task("mock_browser_001") + assert task.task_id == "mock_browser_001" assert task.domain == "browser" def test_load_task_not_found(self): @@ -124,16 +125,17 @@ def test_evaluate_with_random_agent(self): def test_evaluate_specific_tasks(self): """Test evaluating specific tasks.""" - adapter = WAAMockAdapter(num_tasks=10) + adapter = WAAMockAdapter(num_tasks=10, domains=["browser", "notepad"]) agent = RandomAgent(seed=42) + # Mock adapter uses "mock_{domain}_{number:03d}" format results = evaluate_agent_on_benchmark( - agent, adapter, task_ids=["browser_1", "browser_2"], max_steps=10 + agent, adapter, task_ids=["mock_browser_001", "mock_browser_002"], max_steps=10 ) assert len(results) == 2 - assert results[0].task_id == "browser_1" - assert results[1].task_id == "browser_2" + assert results[0].task_id == "mock_browser_001" + assert results[1].task_id == "mock_browser_002" def 
test_evaluate_with_scripted_agent(self): """Test running evaluation with ScriptedAgent.""" diff --git a/tests/integration/test_data_collection.py b/tests/integration/test_data_collection.py index abd2412..c8a1acc 100644 --- a/tests/integration/test_data_collection.py +++ b/tests/integration/test_data_collection.py @@ -30,9 +30,13 @@ import logging from pathlib import Path -from openadapt_ml.benchmarks.agent import RandomAgent -from openadapt_ml.benchmarks.runner import EvaluationConfig, evaluate_agent_on_benchmark -from openadapt_ml.benchmarks.waa import WAAMockAdapter +# Import from the main benchmarks module (re-exports from openadapt-evals) +from openadapt_ml.benchmarks import ( + EvaluationConfig, + RandomAgent, + WAAMockAdapter, + evaluate_agent_on_benchmark, +) # Set up logging logging.basicConfig( diff --git a/tests/test_waa_live.py b/tests/test_waa_live.py deleted file mode 100644 index 73786c7..0000000 --- a/tests/test_waa_live.py +++ /dev/null @@ -1,314 +0,0 @@ -"""Tests for WAALiveAdapter.""" - -import pytest -from unittest.mock import Mock, patch, MagicMock -import json - -from openadapt_ml.benchmarks.waa_live import WAALiveAdapter, WAALiveConfig -from openadapt_ml.benchmarks.base import BenchmarkAction, BenchmarkTask - - -class TestWAALiveConfig: - """Tests for WAALiveConfig.""" - - def test_default_config(self): - """Test default configuration values.""" - config = WAALiveConfig() - assert config.server_url == "http://localhost:5000" - assert config.a11y_backend == "uia" - assert config.screen_width == 1920 - assert config.screen_height == 1200 - assert config.max_steps == 15 - assert config.action_delay == 0.5 - assert config.timeout == 90.0 - - def test_custom_config(self): - """Test custom configuration.""" - config = WAALiveConfig( - server_url="http://192.168.1.100:5000", - max_steps=20, - action_delay=1.0, - ) - assert config.server_url == "http://192.168.1.100:5000" - assert config.max_steps == 20 - assert config.action_delay == 1.0 - - -class TestWAALiveAdapter: - """Tests for WAALiveAdapter.""" - - def test_adapter_properties(self): - """Test adapter properties.""" - adapter = WAALiveAdapter() - assert adapter.name == "waa-live" - assert adapter.benchmark_type == "interactive" - assert adapter.supports_parallel is False - - @patch("openadapt_ml.benchmarks.waa_live.requests") - def test_check_connection_success(self, mock_requests): - """Test successful connection check.""" - mock_requests.get.return_value = Mock(status_code=200) - - adapter = WAALiveAdapter() - assert adapter.check_connection() is True - - mock_requests.get.assert_called_once() - - @patch("openadapt_ml.benchmarks.waa_live.requests.get") - def test_check_connection_failure(self, mock_get): - """Test failed connection check.""" - import requests - mock_get.side_effect = requests.RequestException("Connection refused") - - adapter = WAALiveAdapter() - assert adapter.check_connection() is False - - -class TestActionTranslation: - """Tests for action translation. 
- - The adapter uses element-based grounding via WAA's Computer class: - - Click actions use computer.mouse.move_id(id) for element grounding - - Keyboard actions use pyautogui (no grounding needed) - - Scroll uses computer.mouse.scroll() - """ - - def test_click_with_element_id(self): - """Test click with element ID uses move_id for grounding.""" - adapter = WAALiveAdapter() - adapter._current_rects = {"5": [100, 200, 300, 400]} - - action = BenchmarkAction(type="click", target_node_id="5") - command = adapter._translate_action(action) - - assert "computer.mouse.move_id('5')" in command - assert "computer.mouse.single_click()" in command - - def test_click_fallback_to_coords(self): - """Test click falls back to move_abs when no element ID.""" - adapter = WAALiveAdapter() - adapter._current_rects = {} - - action = BenchmarkAction(type="click", x=500, y=300) - command = adapter._translate_action(action) - - assert "computer.mouse.move_abs(500, 300)" in command - assert "computer.mouse.single_click()" in command - - def test_click_normalized_coords_fallback(self): - """Test click with normalized coordinates falls back to move_abs.""" - adapter = WAALiveAdapter(WAALiveConfig(screen_width=1920, screen_height=1080)) - adapter._current_rects = {} - - action = BenchmarkAction(type="click", x=0.5, y=0.5) - command = adapter._translate_action(action) - - # Normalized coords passed to move_abs (WAA handles conversion) - assert "computer.mouse.move_abs(0.5, 0.5)" in command - assert "computer.mouse.single_click()" in command - - def test_double_click_with_element_id(self): - """Test double click with element ID.""" - adapter = WAALiveAdapter() - adapter._current_rects = {"7": [0, 0, 100, 50]} - - action = BenchmarkAction(type="double_click", target_node_id="7") - command = adapter._translate_action(action) - - assert "computer.mouse.move_id('7')" in command - assert "computer.mouse.double_click()" in command - - def test_type_action(self): - """Test type action uses pyautogui (no grounding needed).""" - adapter = WAALiveAdapter() - - action = BenchmarkAction(type="type", text="Hello World") - command = adapter._translate_action(action) - - assert "pyautogui.write('Hello World'" in command - - def test_type_action_with_quotes(self): - """Test type action with quotes escaped.""" - adapter = WAALiveAdapter() - - action = BenchmarkAction(type="type", text="It's a \"test\"") - command = adapter._translate_action(action) - - # Should escape single quotes - assert "\\'" in command - - def test_key_action(self): - """Test key action uses pyautogui (no grounding needed).""" - adapter = WAALiveAdapter() - - action = BenchmarkAction(type="key", key="Enter") - command = adapter._translate_action(action) - - assert "pyautogui.press('enter')" in command - - def test_key_action_with_modifiers(self): - """Test key action with modifiers.""" - adapter = WAALiveAdapter() - - action = BenchmarkAction(type="key", key="c", modifiers=["Control"]) - command = adapter._translate_action(action) - - assert "pyautogui.hotkey('ctrl', 'c')" in command - - def test_scroll_action_down(self): - """Test scroll down uses computer.mouse.scroll.""" - adapter = WAALiveAdapter() - - action = BenchmarkAction(type="scroll", scroll_direction="down", scroll_amount=5) - command = adapter._translate_action(action) - - assert "computer.mouse.scroll('down')" in command - - def test_scroll_action_up(self): - """Test scroll up uses computer.mouse.scroll.""" - adapter = WAALiveAdapter() - - action = BenchmarkAction(type="scroll", 
scroll_direction="up", scroll_amount=3) - command = adapter._translate_action(action) - - assert "computer.mouse.scroll('up')" in command - - def test_done_action(self): - """Test done action returns None.""" - adapter = WAALiveAdapter() - - action = BenchmarkAction(type="done") - command = adapter._translate_action(action) - - assert command is None - - def test_wait_action(self): - """Test wait action.""" - adapter = WAALiveAdapter() - - action = BenchmarkAction(type="wait") - command = adapter._translate_action(action) - - assert "time.sleep(1)" in command - - -class TestRectExtraction: - """Tests for extracting element rects from a11y tree. - - The adapter extracts element IDs and bboxes from the a11y tree - and sends them to WAA via /update_computer. WAA then handles - the actual grounding when computer.mouse.move_id(id) is called. - """ - - def test_extract_rects_simple(self): - """Test extracting rects from simple a11y tree.""" - adapter = WAALiveAdapter() - a11y_tree = { - "id": "root", - "children": [ - { - "id": "5", - "bbox": [100, 200, 300, 400], - } - ] - } - - rects = adapter._extract_rects_from_a11y(a11y_tree) - - assert "5" in rects - assert rects["5"] == [100, 200, 300, 400] - - def test_extract_rects_nested(self): - """Test extracting rects from nested a11y tree.""" - adapter = WAALiveAdapter() - a11y_tree = { - "id": "root", - "children": [ - { - "id": "1", - "bbox": [0, 0, 500, 500], - "children": [ - { - "id": "3", - "bbox": [50, 50, 150, 100], - } - ] - } - ] - } - - rects = adapter._extract_rects_from_a11y(a11y_tree) - - assert "root" in rects or "1" in rects # Depends on if root has bbox - assert "1" in rects - assert "3" in rects - assert rects["3"] == [50, 50, 150, 100] - - def test_extract_rects_empty_tree(self): - """Test extracting rects from empty a11y tree.""" - adapter = WAALiveAdapter() - - rects = adapter._extract_rects_from_a11y(None) - assert rects == {} - - rects = adapter._extract_rects_from_a11y({}) - assert rects == {} - - def test_extract_rects_no_bbox(self): - """Test elements without bbox are skipped.""" - adapter = WAALiveAdapter() - a11y_tree = { - "id": "root", - "children": [{"id": "5", "name": "Button"}] # No bbox - } - - rects = adapter._extract_rects_from_a11y(a11y_tree) - - # Element without bbox should not be in rects - assert "5" not in rects - - def test_click_element_not_in_rects_warns(self): - """Test click with unknown element ID logs warning and uses coords.""" - adapter = WAALiveAdapter() - adapter._current_rects = {"1": [0, 0, 100, 100]} # Element 7 not here - - action = BenchmarkAction( - type="click", - target_node_id="7", # Not in rects - x=999, y=999 - ) - command = adapter._translate_action(action) - - # Should fall back to coordinate-based click - assert "move_abs" in command - assert "999" in command - - -class TestObservationFetching: - """Tests for observation fetching.""" - - @patch("openadapt_ml.benchmarks.waa_live.requests") - def test_get_observation(self, mock_requests): - """Test fetching observation from server.""" - # Mock screenshot response - screenshot_response = Mock() - screenshot_response.status_code = 200 - screenshot_response.content = b"fake_png_data" - - # Mock a11y response - a11y_response = Mock() - a11y_response.status_code = 200 - a11y_response.json.return_value = {"AT": {"id": "root"}} - - mock_requests.get.side_effect = [screenshot_response, a11y_response] - - adapter = WAALiveAdapter() - obs = adapter._get_observation() - - assert obs.screenshot == b"fake_png_data" - assert obs.accessibility_tree 
== {"id": "root"} - assert obs.viewport == (1920, 1200) - - -if __name__ == "__main__": - pytest.main([__file__, "-v"]) From fdebe76b13807de527311287346e3e8fbff2ee35 Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Wed, 28 Jan 2026 13:26:55 -0500 Subject: [PATCH 05/21] refactor(benchmarks): delete deprecation stubs, import from openadapt-evals Remove deprecation stubs since there are no external users. Tests now import directly from openadapt-evals (canonical location). Deleted: - base.py, waa.py, waa_live.py, runner.py, data_collection.py, live_tracker.py Kept: - agent.py (ML-specific agents: PolicyAgent, APIBenchmarkAgent, UnifiedBaselineAgent) - __init__.py (simplified to only export ML-specific agents) Co-Authored-By: Claude Opus 4.5 --- openadapt_ml/benchmarks/__init__.py | 114 ++------------------- openadapt_ml/benchmarks/base.py | 36 ------- openadapt_ml/benchmarks/data_collection.py | 26 ----- openadapt_ml/benchmarks/live_tracker.py | 24 ----- openadapt_ml/benchmarks/runner.py | 32 ------ openadapt_ml/benchmarks/waa.py | 30 ------ openadapt_ml/benchmarks/waa_live.py | 26 ----- tests/benchmarks/test_api_agent.py | 4 +- tests/benchmarks/test_waa.py | 2 +- tests/integration/test_data_collection.py | 4 +- tests/integration/test_live_eval.py | 8 +- uv.lock | 18 +++- 12 files changed, 36 insertions(+), 288 deletions(-) delete mode 100644 openadapt_ml/benchmarks/base.py delete mode 100644 openadapt_ml/benchmarks/data_collection.py delete mode 100644 openadapt_ml/benchmarks/live_tracker.py delete mode 100644 openadapt_ml/benchmarks/runner.py delete mode 100644 openadapt_ml/benchmarks/waa.py delete mode 100644 openadapt_ml/benchmarks/waa_live.py diff --git a/openadapt_ml/benchmarks/__init__.py b/openadapt_ml/benchmarks/__init__.py index 3c92000..138df99 100644 --- a/openadapt_ml/benchmarks/__init__.py +++ b/openadapt_ml/benchmarks/__init__.py @@ -1,17 +1,15 @@ """Benchmark integration for openadapt-ml. -This module provides benchmark evaluation capabilities by re-exporting from -`openadapt-evals` (the canonical benchmark package) plus ML-specific agents -that wrap openadapt-ml internals. +This module provides ML-specific agents for benchmark evaluation. +These agents wrap openadapt-ml internals (trained policies, API adapters). -For standalone benchmark evaluation (no ML training), use openadapt-evals: +For benchmark infrastructure (adapters, runners, viewers), use openadapt-evals: ```python - from openadapt_evals import ApiAgent, WAAMockAdapter, evaluate_agent_on_benchmark - ``` - -For ML-specific agents that use trained models: - ```python - from openadapt_ml.benchmarks import PolicyAgent, APIBenchmarkAgent + from openadapt_evals import ( + WAAMockAdapter, + WAALiveAdapter, + evaluate_agent_on_benchmark, + ) ``` ML-specific agents (only available in openadapt-ml): @@ -20,110 +18,14 @@ - UnifiedBaselineAgent: Uses openadapt_ml.baselines adapters """ -import warnings - -# Emit deprecation warning for users still importing base classes from here -warnings.warn( - "For standalone benchmark evaluation, prefer importing from openadapt_evals directly. 
" - "openadapt_ml.benchmarks re-exports from openadapt_evals for backward compatibility.", - DeprecationWarning, - stacklevel=2, -) - -# ruff: noqa: E402 -# Imports after warning call are intentional - -# Re-export base classes from openadapt-evals (canonical location) -from openadapt_evals import ( - # Base classes - BenchmarkAction, - BenchmarkAdapter, - BenchmarkObservation, - BenchmarkResult, - BenchmarkTask, - StaticDatasetAdapter, - UIElement, - # Base agent interface - BenchmarkAgent, - # Test/mock agents (no ML deps) - RandomAgent, - ScriptedAgent, - SmartMockAgent, - # Standalone API agent (P0 demo persistence fix) - ApiAgent, - # Evaluation utilities - EvaluationConfig, - compute_domain_metrics, - compute_metrics, - evaluate_agent_on_benchmark, - # WAA adapters - WAAAdapter, - WAAConfig, - WAAMockAdapter, - WAALiveAdapter, - WAALiveConfig, - # Viewer - generate_benchmark_viewer, - # Data collection - ExecutionTraceCollector, - LiveEvaluationTracker, - save_execution_trace, -) - -# ML-specific agents (only available in openadapt-ml) from openadapt_ml.benchmarks.agent import ( APIBenchmarkAgent, PolicyAgent, UnifiedBaselineAgent, ) - -# Lazy import for Azure classes (avoids requiring azure-ai-ml for basic usage) -def __getattr__(name: str): - if name in ("AzureConfig", "AzureWAAOrchestrator", "estimate_cost"): - from openadapt_evals.benchmarks import azure - return getattr(azure, name) - raise AttributeError(f"module {__name__!r} has no attribute {name!r}") - - __all__ = [ - # Base classes (from openadapt-evals) - "BenchmarkAdapter", - "BenchmarkTask", - "BenchmarkObservation", - "BenchmarkAction", - "BenchmarkResult", - "StaticDatasetAdapter", - "UIElement", - # Agents (from openadapt-evals) - "BenchmarkAgent", - "ScriptedAgent", - "RandomAgent", - "SmartMockAgent", - "ApiAgent", - # ML-specific agents (openadapt-ml only) "PolicyAgent", "APIBenchmarkAgent", "UnifiedBaselineAgent", - # Evaluation (from openadapt-evals) - "EvaluationConfig", - "evaluate_agent_on_benchmark", - "compute_metrics", - "compute_domain_metrics", - # WAA (from openadapt-evals) - "WAAAdapter", - "WAAConfig", - "WAAMockAdapter", - "WAALiveAdapter", - "WAALiveConfig", - # Viewer (from openadapt-evals) - "generate_benchmark_viewer", - # Data collection (from openadapt-evals) - "ExecutionTraceCollector", - "LiveEvaluationTracker", - "save_execution_trace", - # Azure (lazy-loaded from openadapt-evals) - "AzureConfig", - "AzureWAAOrchestrator", - "estimate_cost", ] diff --git a/openadapt_ml/benchmarks/base.py b/openadapt_ml/benchmarks/base.py deleted file mode 100644 index 569b6ed..0000000 --- a/openadapt_ml/benchmarks/base.py +++ /dev/null @@ -1,36 +0,0 @@ -"""DEPRECATED: Import from openadapt_evals instead. - -This module is kept for backward compatibility only. -All classes are now provided by openadapt_evals.adapters.base. -""" - -import warnings - -warnings.warn( - "openadapt_ml.benchmarks.base is deprecated. 
" - "Please import from openadapt_evals instead: " - "from openadapt_evals import BenchmarkAdapter, BenchmarkTask, BenchmarkAction", - DeprecationWarning, - stacklevel=2, -) - -# Re-export from canonical location -from openadapt_evals.adapters.base import ( - BenchmarkAction, - BenchmarkAdapter, - BenchmarkObservation, - BenchmarkResult, - BenchmarkTask, - StaticDatasetAdapter, - UIElement, -) - -__all__ = [ - "BenchmarkAction", - "BenchmarkAdapter", - "BenchmarkObservation", - "BenchmarkResult", - "BenchmarkTask", - "StaticDatasetAdapter", - "UIElement", -] diff --git a/openadapt_ml/benchmarks/data_collection.py b/openadapt_ml/benchmarks/data_collection.py deleted file mode 100644 index f8e9e17..0000000 --- a/openadapt_ml/benchmarks/data_collection.py +++ /dev/null @@ -1,26 +0,0 @@ -"""DEPRECATED: Import from openadapt_evals instead. - -This module is kept for backward compatibility only. -All classes are now provided by openadapt_evals.benchmarks.data_collection. -""" - -import warnings - -warnings.warn( - "openadapt_ml.benchmarks.data_collection is deprecated. " - "Please import from openadapt_evals instead: " - "from openadapt_evals import ExecutionTraceCollector, save_execution_trace", - DeprecationWarning, - stacklevel=2, -) - -# Re-export from canonical location -from openadapt_evals.benchmarks.data_collection import ( - ExecutionTraceCollector, - save_execution_trace, -) - -__all__ = [ - "ExecutionTraceCollector", - "save_execution_trace", -] diff --git a/openadapt_ml/benchmarks/live_tracker.py b/openadapt_ml/benchmarks/live_tracker.py deleted file mode 100644 index d719864..0000000 --- a/openadapt_ml/benchmarks/live_tracker.py +++ /dev/null @@ -1,24 +0,0 @@ -"""DEPRECATED: Import from openadapt_evals instead. - -This module is kept for backward compatibility only. -All classes are now provided by openadapt_evals.benchmarks.live_tracker. -""" - -import warnings - -warnings.warn( - "openadapt_ml.benchmarks.live_tracker is deprecated. " - "Please import from openadapt_evals instead: " - "from openadapt_evals import LiveEvaluationTracker", - DeprecationWarning, - stacklevel=2, -) - -# Re-export from canonical location -from openadapt_evals.benchmarks.live_tracker import ( - LiveEvaluationTracker, -) - -__all__ = [ - "LiveEvaluationTracker", -] diff --git a/openadapt_ml/benchmarks/runner.py b/openadapt_ml/benchmarks/runner.py deleted file mode 100644 index b55c99d..0000000 --- a/openadapt_ml/benchmarks/runner.py +++ /dev/null @@ -1,32 +0,0 @@ -"""DEPRECATED: Import from openadapt_evals instead. - -This module is kept for backward compatibility only. -All functions are now provided by openadapt_evals.benchmarks.runner. -""" - -import warnings - -warnings.warn( - "openadapt_ml.benchmarks.runner is deprecated. " - "Please import from openadapt_evals instead: " - "from openadapt_evals import evaluate_agent_on_benchmark, compute_metrics", - DeprecationWarning, - stacklevel=2, -) - -# Re-export from canonical location -from openadapt_evals.benchmarks.runner import ( - EvaluationConfig, - evaluate_agent_on_benchmark, -) -from openadapt_evals.benchmarks import ( - compute_domain_metrics, - compute_metrics, -) - -__all__ = [ - "EvaluationConfig", - "compute_domain_metrics", - "compute_metrics", - "evaluate_agent_on_benchmark", -] diff --git a/openadapt_ml/benchmarks/waa.py b/openadapt_ml/benchmarks/waa.py deleted file mode 100644 index b42004f..0000000 --- a/openadapt_ml/benchmarks/waa.py +++ /dev/null @@ -1,30 +0,0 @@ -"""DEPRECATED: Import from openadapt_evals instead. 
- -This module is kept for backward compatibility only. -All classes are now provided by openadapt_evals.adapters.waa. -""" - -import warnings - -warnings.warn( - "openadapt_ml.benchmarks.waa is deprecated. " - "Please import from openadapt_evals instead: " - "from openadapt_evals import WAAAdapter, WAAMockAdapter, WAAConfig", - DeprecationWarning, - stacklevel=2, -) - -# Re-export from canonical location -from openadapt_evals.adapters.waa import ( - WAA_DOMAINS, - WAAAdapter, - WAAConfig, - WAAMockAdapter, -) - -__all__ = [ - "WAA_DOMAINS", - "WAAAdapter", - "WAAConfig", - "WAAMockAdapter", -] diff --git a/openadapt_ml/benchmarks/waa_live.py b/openadapt_ml/benchmarks/waa_live.py deleted file mode 100644 index 70c7a3d..0000000 --- a/openadapt_ml/benchmarks/waa_live.py +++ /dev/null @@ -1,26 +0,0 @@ -"""DEPRECATED: Import from openadapt_evals instead. - -This module is kept for backward compatibility only. -All classes are now provided by openadapt_evals.adapters.waa_live. -""" - -import warnings - -warnings.warn( - "openadapt_ml.benchmarks.waa_live is deprecated. " - "Please import from openadapt_evals instead: " - "from openadapt_evals import WAALiveAdapter, WAALiveConfig", - DeprecationWarning, - stacklevel=2, -) - -# Re-export from canonical location -from openadapt_evals.adapters.waa import ( - WAALiveAdapter, - WAALiveConfig, -) - -__all__ = [ - "WAALiveAdapter", - "WAALiveConfig", -] diff --git a/tests/benchmarks/test_api_agent.py b/tests/benchmarks/test_api_agent.py index d7732de..211d7bf 100644 --- a/tests/benchmarks/test_api_agent.py +++ b/tests/benchmarks/test_api_agent.py @@ -2,8 +2,8 @@ import pytest -from openadapt_ml.benchmarks import APIBenchmarkAgent, BenchmarkAction -from openadapt_ml.benchmarks.base import BenchmarkObservation, BenchmarkTask +from openadapt_evals import BenchmarkAction, BenchmarkObservation, BenchmarkTask +from openadapt_ml.benchmarks import APIBenchmarkAgent class TestAPIBenchmarkAgentParsing: diff --git a/tests/benchmarks/test_waa.py b/tests/benchmarks/test_waa.py index 3a21248..9dc4cf9 100644 --- a/tests/benchmarks/test_waa.py +++ b/tests/benchmarks/test_waa.py @@ -2,7 +2,7 @@ import pytest -from openadapt_ml.benchmarks import ( +from openadapt_evals import ( BenchmarkAction, BenchmarkObservation, BenchmarkResult, diff --git a/tests/integration/test_data_collection.py b/tests/integration/test_data_collection.py index c8a1acc..159e1f2 100644 --- a/tests/integration/test_data_collection.py +++ b/tests/integration/test_data_collection.py @@ -30,8 +30,8 @@ import logging from pathlib import Path -# Import from the main benchmarks module (re-exports from openadapt-evals) -from openadapt_ml.benchmarks import ( +# Import from openadapt-evals (canonical benchmark package) +from openadapt_evals import ( EvaluationConfig, RandomAgent, WAAMockAdapter, diff --git a/tests/integration/test_live_eval.py b/tests/integration/test_live_eval.py index 448fc7a..68ac704 100644 --- a/tests/integration/test_live_eval.py +++ b/tests/integration/test_live_eval.py @@ -17,8 +17,12 @@ import time from pathlib import Path -from openadapt_ml.benchmarks import RandomAgent, WAAMockAdapter, evaluate_agent_on_benchmark -from openadapt_ml.benchmarks.runner import EvaluationConfig +from openadapt_evals import ( + EvaluationConfig, + RandomAgent, + WAAMockAdapter, + evaluate_agent_on_benchmark, +) def main(): diff --git a/uv.lock b/uv.lock index 68193f3..9c77a77 100644 --- a/uv.lock +++ b/uv.lock @@ -996,7 +996,7 @@ name = "exceptiongroup" version = "1.3.1" source = { registry = 
"https://pypi.org/simple" } dependencies = [ - { name = "typing-extensions", marker = "python_full_version < '3.13'" }, + { name = "typing-extensions", marker = "python_full_version < '3.11'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/50/79/66800aadf48771f6b62f7eb014e352e5d06856655206165d775e675a02c9/exceptiongroup-1.3.1.tar.gz", hash = "sha256:8b412432c6055b0b7d14c310000ae93352ed6754f70fa8f7c34141f91c4e3219", size = 30371 } wheels = [ @@ -2645,6 +2645,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c4/a8/f08d2fb482dc64443ae7208f9616b994b74b63f052737373f9fc32eb0ead/openadapt_capture-0.1.0-py3-none-any.whl", hash = "sha256:229e6d762dcfe22a34655853b5cf7c9eb08a61238cc79eefdffcf01f0c3dc860", size = 57921 }, ] +[[package]] +name = "openadapt-evals" +version = "0.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pillow" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/22/56/2e952caefce3755d82a0633b42ed9fbf4fbb8bfe45da5c693f1e2de50e7d/openadapt_evals-0.1.0.tar.gz", hash = "sha256:1015bc0fffba318e89f963bcf189979e6f8a4edf7a7b9f602339886be731ca6b", size = 256352 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/18/93/d76148490517ae02d417608f7906e2c34a63f68dcc1506c9387484112580/openadapt_evals-0.1.0-py3-none-any.whl", hash = "sha256:05a79c8598b41d90a5c5e33a114dc0aa9f0c81956b5d71f3ebd130d025d79dd0", size = 72408 }, +] + [[package]] name = "openadapt-ml" version = "0.2.0" @@ -2678,6 +2690,9 @@ azure = [ { name = "azure-ai-ml" }, { name = "azure-identity" }, ] +benchmarks = [ + { name = "openadapt-evals" }, +] dev = [ { name = "pytest" }, { name = "ruff" }, @@ -2706,6 +2721,7 @@ requires-dist = [ { name = "google-generativeai", specifier = ">=0.8.5" }, { name = "matplotlib", specifier = ">=3.10.7" }, { name = "openadapt-capture", specifier = ">=0.1.0" }, + { name = "openadapt-evals", marker = "extra == 'benchmarks'", specifier = ">=0.1.0" }, { name = "openai", marker = "extra == 'api'", specifier = ">=1.0.0" }, { name = "peft", specifier = ">=0.18.0" }, { name = "pillow", specifier = ">=12.0.0" }, From e6a09d27a7b59309b9099b97265b8ea8aa1fb6a9 Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Wed, 28 Jan 2026 13:27:31 -0500 Subject: [PATCH 06/21] docs(readme): add WAA benchmark results section with placeholders Add section 15 for Windows Agent Arena benchmark results with clearly marked placeholders. Results will be filled in when full evaluation completes. Warning banner indicates PR should not merge until placeholders are replaced. Sections added: - 15.1 Benchmark Overview - 15.2 Baseline Reproduction (paper vs our run) - 15.3 Model Comparison (GPT-4o, Claude, Qwen variants) - 15.4 Domain Breakdown Co-Authored-By: Claude Opus 4.5 --- README.md | 50 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index a3fed4b..5d3c43f 100644 --- a/README.md +++ b/README.md @@ -892,7 +892,55 @@ For deeper architectural details, see [`docs/design.md`](docs/design.md). --- -## 15. Roadmap +## 15. Windows Agent Arena (WAA) Benchmark Results + +> **⚠️ PLACEHOLDER**: The results below are placeholders. Actual benchmark results will be added once the full evaluation completes. Do not merge this PR until placeholders are replaced with real values. 
+ +### 15.1 Benchmark Overview + +[Windows Agent Arena (WAA)](https://github.com/microsoft/WindowsAgentArena) is a benchmark of **154 tasks across 11 Windows application domains** (Notepad, Chrome, File Explorer, etc.). It evaluates GUI agents on realistic Windows automation tasks. + +### 15.2 Baseline Reproduction + +We run the full WAA benchmark using the same methodology as the original paper to establish baseline performance. + +**WAA Baseline Results (GPT-4o):** + +| Metric | Paper Reported | Our Reproduction | Status | +|--------|----------------|------------------|--------| +| Success Rate | ~19.5% | `[PLACEHOLDER]` | `[PENDING]` | +| Tasks Evaluated | 154 | `[PLACEHOLDER]` | `[PENDING]` | +| Avg Steps/Task | N/A | `[PLACEHOLDER]` | `[PENDING]` | +| Avg Time/Task | N/A | `[PLACEHOLDER]` | `[PENDING]` | + +### 15.3 Model Comparison + +Performance of different agents on WAA: + +| Agent | Success Rate | Avg Steps | Notes | +|-------|--------------|-----------|-------| +| GPT-4o (baseline) | `[PLACEHOLDER]` | `[PLACEHOLDER]` | Zero-shot | +| Claude Sonnet 4.5 | `[PLACEHOLDER]` | `[PLACEHOLDER]` | Zero-shot | +| Qwen3-VL-2B (base) | `[PLACEHOLDER]` | `[PLACEHOLDER]` | Zero-shot | +| Qwen3-VL-2B (FT) | `[PLACEHOLDER]` | `[PLACEHOLDER]` | Fine-tuned on demos | + +### 15.4 Domain Breakdown + +Success rates by Windows application domain: + +| Domain | Tasks | Success Rate | +|--------|-------|--------------| +| Notepad | `[PLACEHOLDER]` | `[PLACEHOLDER]` | +| Chrome | `[PLACEHOLDER]` | `[PLACEHOLDER]` | +| File Explorer | `[PLACEHOLDER]` | `[PLACEHOLDER]` | +| Settings | `[PLACEHOLDER]` | `[PLACEHOLDER]` | +| ... | ... | ... | + +> **Note**: Full domain breakdown will be added when benchmark completes. + +--- + +## 16. Roadmap For the up-to-date, prioritized roadmap (including concrete implementation targets and agent-executable acceptance criteria), see From 23beca302379bfd3694d10cc4c656ef84b293bc4 Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Wed, 28 Jan 2026 13:31:42 -0500 Subject: [PATCH 07/21] docs(readme): move WAA benchmark results to openadapt-evals WAA benchmark results belong in openadapt-evals (the benchmark infrastructure package) rather than openadapt-ml (the training package). See: https://github.com/OpenAdaptAI/openadapt-evals/pull/22 Co-Authored-By: Claude Opus 4.5 --- README.md | 50 +------------------------------------------------- 1 file changed, 1 insertion(+), 49 deletions(-) diff --git a/README.md b/README.md index 5d3c43f..a3fed4b 100644 --- a/README.md +++ b/README.md @@ -892,55 +892,7 @@ For deeper architectural details, see [`docs/design.md`](docs/design.md). --- -## 15. Windows Agent Arena (WAA) Benchmark Results - -> **⚠️ PLACEHOLDER**: The results below are placeholders. Actual benchmark results will be added once the full evaluation completes. Do not merge this PR until placeholders are replaced with real values. - -### 15.1 Benchmark Overview - -[Windows Agent Arena (WAA)](https://github.com/microsoft/WindowsAgentArena) is a benchmark of **154 tasks across 11 Windows application domains** (Notepad, Chrome, File Explorer, etc.). It evaluates GUI agents on realistic Windows automation tasks. - -### 15.2 Baseline Reproduction - -We run the full WAA benchmark using the same methodology as the original paper to establish baseline performance. 
- -**WAA Baseline Results (GPT-4o):** - -| Metric | Paper Reported | Our Reproduction | Status | -|--------|----------------|------------------|--------| -| Success Rate | ~19.5% | `[PLACEHOLDER]` | `[PENDING]` | -| Tasks Evaluated | 154 | `[PLACEHOLDER]` | `[PENDING]` | -| Avg Steps/Task | N/A | `[PLACEHOLDER]` | `[PENDING]` | -| Avg Time/Task | N/A | `[PLACEHOLDER]` | `[PENDING]` | - -### 15.3 Model Comparison - -Performance of different agents on WAA: - -| Agent | Success Rate | Avg Steps | Notes | -|-------|--------------|-----------|-------| -| GPT-4o (baseline) | `[PLACEHOLDER]` | `[PLACEHOLDER]` | Zero-shot | -| Claude Sonnet 4.5 | `[PLACEHOLDER]` | `[PLACEHOLDER]` | Zero-shot | -| Qwen3-VL-2B (base) | `[PLACEHOLDER]` | `[PLACEHOLDER]` | Zero-shot | -| Qwen3-VL-2B (FT) | `[PLACEHOLDER]` | `[PLACEHOLDER]` | Fine-tuned on demos | - -### 15.4 Domain Breakdown - -Success rates by Windows application domain: - -| Domain | Tasks | Success Rate | -|--------|-------|--------------| -| Notepad | `[PLACEHOLDER]` | `[PLACEHOLDER]` | -| Chrome | `[PLACEHOLDER]` | `[PLACEHOLDER]` | -| File Explorer | `[PLACEHOLDER]` | `[PLACEHOLDER]` | -| Settings | `[PLACEHOLDER]` | `[PLACEHOLDER]` | -| ... | ... | ... | - -> **Note**: Full domain breakdown will be added when benchmark completes. - ---- - -## 16. Roadmap +## 15. Roadmap For the up-to-date, prioritized roadmap (including concrete implementation targets and agent-executable acceptance criteria), see From 90c5454e436acfc9b68076848b1f4bf409b904ec Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Wed, 28 Jan 2026 17:57:02 -0500 Subject: [PATCH 08/21] feat(cli): add VNC auto-launch and --fast VM option - Add setup_vnc_tunnel_and_browser() helper for automatic VNC access - Add VM_SIZE_FAST constants with D8 series sizes - Add VM_SIZE_FAST_FALLBACKS for automatic region/size retry - Add --fast flag to create command for faster installations - Add --fast flag to start command for more QEMU resources (6 cores, 16GB) - Opens browser automatically after container starts Co-Authored-By: Claude Opus 4.5 --- openadapt_ml/benchmarks/cli.py | 217 ++++++++++++++++++++++++++------- 1 file changed, 174 insertions(+), 43 deletions(-) diff --git a/openadapt_ml/benchmarks/cli.py b/openadapt_ml/benchmarks/cli.py index d16f231..0a8e3bc 100644 --- a/openadapt_ml/benchmarks/cli.py +++ b/openadapt_ml/benchmarks/cli.py @@ -35,6 +35,7 @@ import subprocess import sys import time +import webbrowser from datetime import datetime from pathlib import Path from typing import Optional @@ -43,7 +44,24 @@ # Constants (single source of truth) # ============================================================================= -VM_SIZE = "Standard_D4ds_v4" +# VM sizes with nested virtualization support +# Standard: $0.19/hr, 4 vCPU, 16GB RAM - baseline +# Fast: $0.38/hr, 8 vCPU, 32GB RAM - ~30% faster install, ~40% faster eval +VM_SIZE_STANDARD = "Standard_D4ds_v4" +VM_SIZE_FAST = "Standard_D8ds_v5" +VM_SIZE = VM_SIZE_STANDARD # Default, can be overridden by --fast flag + +# Fallback sizes for --fast mode (in order of preference) +# D8ds_v5: First choice (v5 with local SSD) +# D8s_v5: v5 without local SSD +# D8ds_v4: v4 with local SSD +# D8as_v5: AMD version +VM_SIZE_FAST_FALLBACKS = [ + ("Standard_D8ds_v5", 0.38), + ("Standard_D8s_v5", 0.36), + ("Standard_D8ds_v4", 0.38), + ("Standard_D8as_v5", 0.34), +] VM_REGIONS = ["centralus", "eastus", "westus2", "eastus2"] VM_NAME = "waa-eval-vm" RESOURCE_GROUP = "openadapt-agents" @@ -62,6 +80,36 @@ "ConnectTimeout=10", ] + +def 
setup_vnc_tunnel_and_browser(ip: str) -> Optional[subprocess.Popen]: + """Set up SSH tunnel for VNC and open browser. + + Returns the tunnel process on success, None on failure. + """ + # Kill any existing tunnel on port 8006 + subprocess.run(["pkill", "-f", "ssh.*8006:localhost:8006"], capture_output=True) + + # Start SSH tunnel in background + tunnel_proc = subprocess.Popen( + ["ssh", *SSH_OPTS, "-N", "-L", "8006:localhost:8006", f"azureuser@{ip}"], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + + # Wait for tunnel to establish + time.sleep(2) + + # Check if tunnel is running + if tunnel_proc.poll() is not None: + return None + + # Open browser + vnc_url = "http://localhost:8006" + webbrowser.open(vnc_url) + + return tunnel_proc + + # Dockerfile location (relative to this file) DOCKERFILE_PATH = Path(__file__).parent / "waa_deploy" / "Dockerfile" @@ -287,7 +335,6 @@ def wait_for_ssh(ip: str, timeout: int = 120) -> bool: def cmd_create(args): """Create Azure VM with nested virtualization.""" init_logging() - log("CREATE", f"Creating VM '{VM_NAME}' ({VM_SIZE})...") # Check if VM already exists ip = get_vm_ip() @@ -296,49 +343,75 @@ def cmd_create(args): log("CREATE", "Use 'delete' first if you want to recreate") return 0 - # Try regions until one works + # Determine which sizes to try + use_fast = getattr(args, "fast", False) + if use_fast: + # Try multiple fast sizes with fallbacks + sizes_to_try = VM_SIZE_FAST_FALLBACKS + log("CREATE", f"Creating VM '{VM_NAME}' with --fast (trying multiple D8 sizes)...") + else: + # Standard mode: single size + sizes_to_try = [(VM_SIZE_STANDARD, 0.19)] + log("CREATE", f"Creating VM '{VM_NAME}' ({VM_SIZE_STANDARD}, $0.19/hr)...") + + # Try size+region combinations until one works vm_created = False - for region in VM_REGIONS: - log("CREATE", f"Trying {region}...", end=" ") + successful_size = None + successful_cost = None - result = subprocess.run( - [ - "az", - "vm", - "create", - "--resource-group", - RESOURCE_GROUP, - "--name", - VM_NAME, - "--location", - region, - "--image", - "Ubuntu2204", - "--size", - VM_SIZE, - "--admin-username", - "azureuser", - "--generate-ssh-keys", - "--public-ip-sku", - "Standard", - ], - capture_output=True, - text=True, - ) + for vm_size, cost_per_hour in sizes_to_try: + log("CREATE", f"Trying size {vm_size} (${cost_per_hour:.2f}/hr)...") - if result.returncode == 0: - vm_info = json.loads(result.stdout) - ip = vm_info.get("publicIpAddress", "") - log("CREATE", f"created ({ip})") - vm_created = True + for region in VM_REGIONS: + log("CREATE", f" {region}...", end=" ") + + result = subprocess.run( + [ + "az", + "vm", + "create", + "--resource-group", + RESOURCE_GROUP, + "--name", + VM_NAME, + "--location", + region, + "--image", + "Ubuntu2204", + "--size", + vm_size, + "--admin-username", + "azureuser", + "--generate-ssh-keys", + "--public-ip-sku", + "Standard", + ], + capture_output=True, + text=True, + ) + + if result.returncode == 0: + vm_info = json.loads(result.stdout) + ip = vm_info.get("publicIpAddress", "") + log("CREATE", f"created ({ip})") + vm_created = True + successful_size = vm_size + successful_cost = cost_per_hour + break + else: + log("CREATE", "unavailable") + + if vm_created: break - else: - log("CREATE", "unavailable") if not vm_created: - log("CREATE", "ERROR: Could not create VM in any region") + log("CREATE", "ERROR: Could not create VM in any region with any size") + if use_fast: + log("CREATE", "Tried sizes: " + ", ".join(s[0] for s in sizes_to_try)) return 1 + log("CREATE", 
f"Successfully created {successful_size} (${successful_cost:.2f}/hr) in {region}") + # Wait for SSH log("CREATE", "Waiting for SSH...") if not wait_for_ssh(ip): @@ -628,7 +701,16 @@ def cmd_start(args): # - Downloads Windows 11 Enterprise if not present # - Boots QEMU VM # - Runs WAA server automatically via FirstLogonCommands - log("START", "Starting container with VERSION=11e...") + # QEMU resource allocation (--fast uses more resources on D8ds_v5) + if getattr(args, "fast", False): + ram_size = "16G" + cpu_cores = 6 + log("START", "Starting container with VERSION=11e (FAST mode: 6 cores, 16GB RAM)...") + else: + ram_size = "8G" + cpu_cores = 4 + log("START", "Starting container with VERSION=11e...") + docker_cmd = f"""docker run -d \\ --name winarena \\ --device=/dev/kvm \\ @@ -638,8 +720,8 @@ def cmd_start(args): -p 7200:7200 \\ -v /mnt/waa-storage:/storage \\ -e VERSION=11e \\ - -e RAM_SIZE=8G \\ - -e CPU_CORES=4 \\ + -e RAM_SIZE={ram_size} \\ + -e CPU_CORES={cpu_cores} \\ -e DISK_SIZE=64G \\ {DOCKER_IMAGE}""" @@ -650,8 +732,19 @@ def cmd_start(args): log("START", "Container started") log("START", "Windows will boot and install (15-20 min on first run)") - log("START", "Monitor via: uv run python -m openadapt_ml.benchmarks.cli_v2 logs") - log("START", f"VNC (via SSH tunnel): ssh -L 8006:localhost:8006 azureuser@{ip}") + + # Auto-launch VNC unless --no-vnc specified + if not getattr(args, "no_vnc", False): + log("START", "Auto-launching VNC viewer...") + tunnel_proc = setup_vnc_tunnel_and_browser(ip) + if tunnel_proc: + log("START", f"VNC auto-launched at http://localhost:8006 (tunnel PID: {tunnel_proc.pid})") + else: + log("START", "WARNING: VNC tunnel failed to start") + log("START", f"Manual VNC: ssh -L 8006:localhost:8006 azureuser@{ip}") + else: + log("START", f"VNC (via SSH tunnel): ssh -L 8006:localhost:8006 azureuser@{ip}") + return 0 @@ -1255,13 +1348,34 @@ def cmd_deallocate(args): if result.returncode == 0: log("DEALLOCATE", "VM deallocated (billing stopped)") - log("DEALLOCATE", "Use 'az vm start' to resume") + log("DEALLOCATE", "Use 'vm-start' to resume") return 0 else: log("DEALLOCATE", f"ERROR: {result.stderr}") return 1 +def cmd_vm_start(args): + """Start a deallocated VM.""" + init_logging() + log("VM-START", f"Starting VM '{VM_NAME}'...") + + result = subprocess.run( + ["az", "vm", "start", "-g", RESOURCE_GROUP, "-n", VM_NAME], + capture_output=True, + text=True, + ) + + if result.returncode == 0: + ip = get_vm_ip() + log("VM-START", f"VM started: {ip}") + log("VM-START", "Run 'build' then 'start' to launch WAA container") + return 0 + else: + log("VM-START", f"ERROR: {result.stderr}") + return 1 + + def cmd_exec(args): """Run command on VM host.""" ip = get_vm_ip() @@ -1630,6 +1744,11 @@ def main(): # create p_create = subparsers.add_parser("create", help="Create Azure VM") + p_create.add_argument( + "--fast", + action="store_true", + help="Use larger VM (D8ds_v5, $0.38/hr) for ~30%% faster install, ~40%% faster eval", + ) p_create.set_defaults(func=cmd_create) # delete @@ -1651,6 +1770,14 @@ def main(): p_start.add_argument( "--fresh", action="store_true", help="Clean storage for fresh Windows install" ) + p_start.add_argument( + "--no-vnc", action="store_true", help="Don't auto-launch VNC viewer" + ) + p_start.add_argument( + "--fast", + action="store_true", + help="Allocate more CPU/RAM to QEMU (use with D8ds_v5 VM)", + ) p_start.set_defaults(func=cmd_start) # stop @@ -1720,6 +1847,10 @@ def main(): p_dealloc = subparsers.add_parser("deallocate", help="Stop VM 
(preserves disk)") p_dealloc.set_defaults(func=cmd_deallocate) + # vm-start + p_vmstart = subparsers.add_parser("vm-start", help="Start a deallocated VM") + p_vmstart.set_defaults(func=cmd_vm_start) + # logs p_logs = subparsers.add_parser("logs", help="Show WAA status and logs") p_logs.add_argument( From ab2414d79fb85e515118ebc53c493781a3782cab Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Wed, 28 Jan 2026 17:57:07 -0500 Subject: [PATCH 09/21] docs: add WAA speedup options documentation - Document --fast VM flag usage - Explain parallelization options - Detail golden image approach for future optimization Co-Authored-By: Claude Opus 4.5 --- docs/waa_speedup_options.md | 94 +++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 docs/waa_speedup_options.md diff --git a/docs/waa_speedup_options.md b/docs/waa_speedup_options.md new file mode 100644 index 0000000..1ba5de1 --- /dev/null +++ b/docs/waa_speedup_options.md @@ -0,0 +1,94 @@ +# WAA Speedup Options + +## Summary Table + +| Option | Speedup | Cost Impact | Recommended | +|--------|---------|-------------|-------------| +| `--fast` flag | ~30% install, ~40% eval | +$0.19/hr | YES for dev | +| Deallocated VM | Skip 25min install | ~$1.50/mo | YES for repeat runs | +| Parallelization | 45x (154 tasks) | -78% cost | YES for large benchmarks | + +## Option 1: `--fast` Flag (Double Hardware) + +Use larger VM with more CPU/RAM allocated to QEMU. + +**Usage:** +```bash +# Create fast VM +uv run python -m openadapt_ml.benchmarks.cli create --fast + +# Start with fast QEMU allocation +uv run python -m openadapt_ml.benchmarks.cli start --fast +``` + +**Specs:** + +| Mode | VM Size | vCPU | RAM | QEMU Cores | QEMU RAM | Cost/hr | +|------|---------|------|-----|------------|----------|---------| +| Standard | D4ds_v4 | 4 | 16GB | 4 | 8GB | $0.19 | +| Fast | D8ds_v5 | 8 | 32GB | 6 | 16GB | $0.38 | + +**Expected Speedups:** +- Windows installation: ~30% faster (25min → ~18min) +- Task evaluation: ~40% faster (navi agent ML inference benefits from more CPU) +- Total benchmark (30 tasks): ~35% faster + +**When to use:** +- Development/debugging when you don't want to wait +- Time-sensitive evaluations +- Cost difference is negligible (~$0.19/hr extra) + +## Option 2: Deallocated "Golden" VM + +Keep a VM deallocated after WAA is fully installed. Restart when needed. + +**How it works:** +1. First run: Create VM, install WAA fully (~25 min) +2. After use: `deallocate` (stops billing, keeps disk) +3. Next time: `vm-start` → boots in ~2-3 min with WAA ready + +**Cost:** +- Deallocated VM: $0 compute +- Disk storage: ~$0.05/GB/month = ~$1.50/month for 30GB + +**Commands:** +```bash +# After first successful run +uv run python -m openadapt_ml.benchmarks.cli deallocate + +# Next time +uv run python -m openadapt_ml.benchmarks.cli vm-start +uv run python -m openadapt_ml.benchmarks.cli start # Container starts, Windows boots in 2-3 min +``` + +## Option 3: Parallelization (Best for Large Benchmarks) + +Run multiple VMs in parallel for large task sets. 
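As a rough illustration of the fan-out, here is a minimal Python sketch of partitioning the 154 task IDs across workers. The `run_tasks_on_worker()` helper and the per-worker VM names are hypothetical placeholders, not existing CLI or library functions; the actual orchestration design is covered in the implementation plan referenced below.

```python
from concurrent.futures import ThreadPoolExecutor

def run_tasks_on_worker(worker_name: str, task_ids: list[str]) -> dict[str, bool]:
    """Hypothetical helper: run the given task IDs on one dedicated worker VM
    and return {task_id: success}. In practice this would wrap the existing
    per-VM create/build/start/run flow."""
    raise NotImplementedError  # placeholder only

def partition(task_ids: list[str], num_workers: int) -> list[list[str]]:
    """Round-robin split so each worker gets ~len(task_ids)/num_workers tasks."""
    return [task_ids[i::num_workers] for i in range(num_workers)]

def run_parallel(task_ids: list[str], num_workers: int = 5) -> dict[str, bool]:
    """Dispatch one worker per chunk and merge the per-task results."""
    results: dict[str, bool] = {}
    with ThreadPoolExecutor(max_workers=num_workers) as pool:
        futures = [
            pool.submit(run_tasks_on_worker, f"waa-eval-vm-{i}", chunk)
            for i, chunk in enumerate(partition(task_ids, num_workers))
        ]
        for future in futures:
            results.update(future.result())
    return results
```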
+ +**Speedup for 154 tasks:** + +| Workers | Time | Cost | vs Single VM | +|---------|------|------|--------------| +| 1 (sequential) | ~15 hours | $2.88 | baseline | +| 5 | ~3 hours | $1.14 | 5x faster, 60% cheaper | +| 10 | ~1.5 hours | $0.63 | 10x faster, 78% cheaper | + +**Implementation:** See `docs/waa_parallelization_plan.md` + +## Quick Reference + +```bash +# Standard mode (default) +uv run python -m openadapt_ml.benchmarks.cli create +uv run python -m openadapt_ml.benchmarks.cli build +uv run python -m openadapt_ml.benchmarks.cli start + +# Fast mode (double hardware) +uv run python -m openadapt_ml.benchmarks.cli create --fast +uv run python -m openadapt_ml.benchmarks.cli build +uv run python -m openadapt_ml.benchmarks.cli start --fast + +# Reuse deallocated VM +uv run python -m openadapt_ml.benchmarks.cli vm-start +uv run python -m openadapt_ml.benchmarks.cli start +``` From 988d207f2f2e8d1f3b096e26e20b48034bcac08c Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Wed, 28 Jan 2026 18:01:51 -0500 Subject: [PATCH 10/21] docs(readme): add benchmark execution logs section - Add section 13.5 with log viewing commands - Add benchmark run commands with examples - Renumber screenshot capture tool section to 13.6 Co-Authored-By: Claude Opus 4.5 --- README.md | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index a3fed4b..1814fdc 100644 --- a/README.md +++ b/README.md @@ -825,9 +825,42 @@ uv run python -m openadapt_ml.benchmarks.cli vm monitor --mock uv run python -m openadapt_ml.benchmarks.cli vm monitor --auto-shutdown-hours 2 ``` +### 13.5 Benchmark Execution Logs + +View benchmark execution progress and logs: + +```bash +# View WAA status and container logs +uv run python -m openadapt_ml.benchmarks.cli logs + +# Check WAA server status (probe endpoint) +uv run python -m openadapt_ml.benchmarks.cli probe + +# Check VM/Azure status +uv run python -m openadapt_ml.benchmarks.cli status + +# Download benchmark results from VM +uv run python -m openadapt_ml.benchmarks.cli download + +# Analyze downloaded results +uv run python -m openadapt_ml.benchmarks.cli analyze +``` + +**Running benchmarks:** +```bash +# Run full benchmark (154 tasks) +uv run python -m openadapt_ml.benchmarks.cli run --num-tasks 154 + +# Run specific domain +uv run python -m openadapt_ml.benchmarks.cli run --domain notepad --num-tasks 5 + +# Run single task +uv run python -m openadapt_ml.benchmarks.cli run --task notepad_1 +``` + For complete VM management commands and Azure setup instructions, see [`CLAUDE.md`](CLAUDE.md) and [`docs/azure_waa_setup.md`](docs/azure_waa_setup.md). 
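For programmatic runs (for example in CI smoke tests), the same evaluation can be driven from Python via openadapt-evals instead of the CLI. The sketch below uses the mock adapter so no VM is needed; constructor arguments and the runner's exact signature are assumptions based on the names the integration tests import, so check the openadapt-evals package for the real API.

```python
from openadapt_evals import (
    EvaluationConfig,
    RandomAgent,
    WAAMockAdapter,
    evaluate_agent_on_benchmark,
)

# Mock adapter: exercises the evaluation loop without a Windows VM.
adapter = WAAMockAdapter()
# RandomAgent is a no-ML test agent; swap in ApiAgent or PolicyAgent for real runs.
agent = RandomAgent()
# Default configuration; field names and defaults are defined by openadapt-evals.
config = EvaluationConfig()

results = evaluate_agent_on_benchmark(agent, adapter, config)
print(results)
```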
-### 13.5 Screenshot Capture Tool +### 13.6 Screenshot Capture Tool Capture screenshots of dashboards and VMs for documentation and PR purposes: From 0b6b206fd03ec8f4b0fc89a88bb40aa9209390da Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Wed, 28 Jan 2026 18:05:08 -0500 Subject: [PATCH 11/21] docs(readme): clarify --run flag for benchmark execution logs - Add logs --run command for viewing task progress - Add logs --run -f for live streaming - Add logs --run --tail N for last N lines Co-Authored-By: Claude Opus 4.5 --- README.md | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 1814fdc..e208c67 100644 --- a/README.md +++ b/README.md @@ -830,9 +830,18 @@ uv run python -m openadapt_ml.benchmarks.cli vm monitor --auto-shutdown-hours 2 View benchmark execution progress and logs: ```bash -# View WAA status and container logs +# View WAA container status and Docker logs uv run python -m openadapt_ml.benchmarks.cli logs +# View WAA benchmark execution logs (task progress, agent actions) +uv run python -m openadapt_ml.benchmarks.cli logs --run + +# Stream execution logs live +uv run python -m openadapt_ml.benchmarks.cli logs --run -f + +# Show last N lines of execution logs +uv run python -m openadapt_ml.benchmarks.cli logs --run --tail 100 + # Check WAA server status (probe endpoint) uv run python -m openadapt_ml.benchmarks.cli probe From 1e821768cf2c9f91764bc48049c3d1f5f120f2ef Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Wed, 28 Jan 2026 18:06:43 -0500 Subject: [PATCH 12/21] docs(readme): add example output for logs commands - Add example output for `logs` (container status) - Add example output for `logs --run -f` (benchmark execution) Co-Authored-By: Claude Opus 4.5 --- README.md | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/README.md b/README.md index e208c67..d5712b3 100644 --- a/README.md +++ b/README.md @@ -841,7 +841,47 @@ uv run python -m openadapt_ml.benchmarks.cli logs --run -f # Show last N lines of execution logs uv run python -m openadapt_ml.benchmarks.cli logs --run --tail 100 +``` + +**Example: Container status (`logs`)** +``` +WAA Status (20.12.180.208) +============================================================ + +[Docker Images] +REPOSITORY TAG SIZE +waa-auto latest 25.4GB +windowsarena/winarena latest 25.8GB + +[Container] + Status: Up 49 minutes + +[Storage] + Total: 21G + Disk image: 64G + +[QEMU VM] + Status: Running (PID 1471) + CPU: 176%, MEM: 51.6%, Uptime: 47:28 +[WAA Server] + "status": "Probe successful" + (READY) +``` + +**Example: Benchmark execution logs (`logs --run -f`)** +``` +Run log: /home/azureuser/cli_logs/run_20260128_175507.log +------------------------------------------------------------ +Streaming log (Ctrl+C to stop)... + +[2026-01-28 23:05:10,303 INFO agent/401-MainProcess] Thinking... 
+[2026-01-28 23:05:17,318 INFO python/62-MainProcess] Updated computer successfully +[2026-01-28 23:05:17,318 INFO lib_run_single/56-MainProcess] Step 9: computer.window_manager.switch_to_application("Summer Trip - File Explorer") +``` + +**Other useful commands:** +```bash # Check WAA server status (probe endpoint) uv run python -m openadapt_ml.benchmarks.cli probe From 0c699ed8a202e6715e96f0d6abf30adf078bd212 Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Wed, 28 Jan 2026 18:28:59 -0500 Subject: [PATCH 13/21] feat(cli): add --progress flag for benchmark ETA - Add _show_benchmark_progress() function - Parse run logs for completed task count - Calculate elapsed time and estimated remaining - Show progress percentage Example usage: uv run python -m openadapt_ml.benchmarks.cli logs --progress Co-Authored-By: Claude Opus 4.5 --- README.md | 21 +++++++++ openadapt_ml/benchmarks/cli.py | 81 ++++++++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+) diff --git a/README.md b/README.md index d5712b3..2c5164f 100644 --- a/README.md +++ b/README.md @@ -841,6 +841,9 @@ uv run python -m openadapt_ml.benchmarks.cli logs --run -f # Show last N lines of execution logs uv run python -m openadapt_ml.benchmarks.cli logs --run --tail 100 + +# Show benchmark progress and ETA +uv run python -m openadapt_ml.benchmarks.cli logs --progress ``` **Example: Container status (`logs`)** @@ -880,6 +883,24 @@ Streaming log (Ctrl+C to stop)... [2026-01-28 23:05:17,318 INFO lib_run_single/56-MainProcess] Step 9: computer.window_manager.switch_to_application("Summer Trip - File Explorer") ``` +**Example: Benchmark progress (`logs --progress`)** +``` +=== WAA Benchmark Progress === + +Log: /home/azureuser/cli_logs/run_20260128_175507.log +Started: 2026-01-28 22:55:14 +Latest: 2026-01-28 23:28:37 + +Tasks completed: 1 / 154 +Elapsed: 33 minutes + +Avg time per task: ~33 min +Remaining tasks: 153 +Estimated remaining: ~84h 9m + +Progress: 0% [1/154] +``` + **Other useful commands:** ```bash # Check WAA server status (probe endpoint) diff --git a/openadapt_ml/benchmarks/cli.py b/openadapt_ml/benchmarks/cli.py index 0a8e3bc..caf0567 100644 --- a/openadapt_ml/benchmarks/cli.py +++ b/openadapt_ml/benchmarks/cli.py @@ -1462,6 +1462,76 @@ def cmd_vnc(args): return 0 +def _show_benchmark_progress(ip: str) -> int: + """Show benchmark progress with estimated completion time. + + Parses the run log to count completed tasks and estimate remaining time. + """ + # Find the most recent run log + result = ssh_run(ip, "ls -t /home/azureuser/cli_logs/run_*.log 2>/dev/null | head -1") + log_file = result.stdout.strip() + + if not log_file: + print("No benchmark running. 
Start one with: run --num-tasks N") + return 1 + + # Get task count and timestamps + result = ssh_run( + ip, + f""" + echo "=== WAA Benchmark Progress ===" + echo "" + + # Count completed tasks (each "Result:" line = 1 task done) + COMPLETED=$(grep -c "Result:" {log_file} 2>/dev/null || echo 0) + # Count total tasks from task list (sum of all domain counts) + TOTAL=$(grep -A20 "Left tasks:" {log_file} | grep -E "^[a-z_]+: [0-9]+" | awk -F': ' '{{sum+=$2}} END {{print sum}}') + [ -z "$TOTAL" ] || [ "$TOTAL" -eq 0 ] && TOTAL=154 + + # Get timestamps + FIRST_TS=$(grep -oE '\\[2026-[0-9-]+ [0-9:]+' {log_file} | head -1 | tr -d '[') + LAST_TS=$(grep -oE '\\[2026-[0-9-]+ [0-9:]+' {log_file} | tail -1 | tr -d '[') + + echo "Log: {log_file}" + echo "Started: $FIRST_TS" + echo "Latest: $LAST_TS" + echo "" + echo "Tasks completed: $COMPLETED / $TOTAL" + + # Calculate elapsed minutes + if [ -n "$FIRST_TS" ] && [ -n "$LAST_TS" ]; then + START_H=$(echo "$FIRST_TS" | awk '{{print $2}}' | cut -d: -f1) + START_M=$(echo "$FIRST_TS" | awk '{{print $2}}' | cut -d: -f2) + NOW_H=$(echo "$LAST_TS" | awk '{{print $2}}' | cut -d: -f1) + NOW_M=$(echo "$LAST_TS" | awk '{{print $2}}' | cut -d: -f2) + + ELAPSED_MIN=$(( (NOW_H - START_H) * 60 + (NOW_M - START_M) )) + echo "Elapsed: $ELAPSED_MIN minutes" + + if [ "$COMPLETED" -gt 0 ] && [ "$ELAPSED_MIN" -gt 0 ]; then + MIN_PER_TASK=$((ELAPSED_MIN / COMPLETED)) + REMAINING=$((TOTAL - COMPLETED)) + EST_MIN=$((REMAINING * MIN_PER_TASK)) + EST_H=$((EST_MIN / 60)) + EST_M=$((EST_MIN % 60)) + + echo "" + echo "Avg time per task: ~$MIN_PER_TASK min" + echo "Remaining tasks: $REMAINING" + echo "Estimated remaining: ~${{EST_H}}h ${{EST_M}}m" + + # Progress bar + PCT=$((COMPLETED * 100 / TOTAL)) + echo "" + echo "Progress: $PCT% [$COMPLETED/$TOTAL]" + fi + fi + """, + ) + print(result.stdout) + return 0 + + def _show_run_logs(ip: str, follow: bool = False, tail: Optional[int] = None) -> int: """Show the most recent run command log file. @@ -1523,12 +1593,17 @@ def cmd_logs(args): Default behavior shows all relevant logs (docker, storage, probe status). Use --follow to stream docker logs continuously. Use --run to show run command output instead of container logs. + Use --progress to show benchmark progress and ETA. 
""" ip = get_vm_ip() if not ip: print("ERROR: VM not found") return 1 + # Handle --progress flag: show benchmark progress + if getattr(args, "progress", False): + return _show_benchmark_progress(ip) + # Handle --run flag: show run command output if args.run: return _show_run_logs(ip, args.follow, args.tail) @@ -1864,6 +1939,12 @@ def main(): action="store_true", help="Show run command output instead of container logs", ) + p_logs.add_argument( + "--progress", + "-p", + action="store_true", + help="Show benchmark progress and estimated completion time", + ) p_logs.set_defaults(func=cmd_logs) # exec From 06dd11790cc24298202810036c0b92fa0fc387b8 Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Wed, 28 Jan 2026 18:31:53 -0500 Subject: [PATCH 14/21] docs(research): add cua.ai vs openadapt-ml WAA comparison Comprehensive analysis of Cua (YC X25) computer-use agent platform: - Architecture comparison (composite agents, sandbox-first) - Benchmark framework differences (cua-bench vs openadapt-evals) - Training data generation (trajectory replotting) - Recommendations: adopt patterns, not full migration Key findings: - Cua's parallelization uses multiple sandboxes (like our multi-VM plan) - Composite agent pattern could reduce API costs - HTML capture enables training data diversity Co-Authored-By: Claude Opus 4.5 --- docs/research/cua_waa_comparison.md | 532 ++++++++++++++++++++++++++++ 1 file changed, 532 insertions(+) create mode 100644 docs/research/cua_waa_comparison.md diff --git a/docs/research/cua_waa_comparison.md b/docs/research/cua_waa_comparison.md new file mode 100644 index 0000000..8c38570 --- /dev/null +++ b/docs/research/cua_waa_comparison.md @@ -0,0 +1,532 @@ +# Cua vs OpenAdapt-ML Windows Agent Arena (WAA) Implementation Comparison + +**Date**: 2026-01-28 (Updated) +**Status**: Research Analysis +**Author**: Research Agent + +--- + +## Quick Reference: Key Metrics + +| Metric | Cua/OpenAI CUA | OpenAdapt-ML | Microsoft WAA (Navi) | +|--------|----------------|--------------|----------------------| +| WAA Success Rate | N/A (OSWorld: 38.1%) | In progress | 19.5% (GPT-4V) | +| OSWorld Success Rate | 38.1% (OpenAI CUA) | Not implemented | N/A | +| Human Baseline | 72-74.5% | 74.5% (WAA) | 74.5% | +| VM Setup Time | Minutes (Lume) | ~15-20 min (Azure) | ~20 min | +| Primary Platform | macOS (Apple Silicon) | Windows (Azure) | Windows (Azure) | + +--- + +## Executive Summary + +This document analyzes [Cua (trycua/cua)](https://github.com/trycua/cua), a YC X25-backed open-source platform for Computer-Use Agents, and compares it with our OpenAdapt-Evals/OpenAdapt-ML two-package architecture. + +**Key Finding**: Cua represents a significantly more comprehensive infrastructure platform that addresses many problems we've been solving piecemeal. However, adopting Cua wholesale would require substantial architectural changes and has notable trade-offs around Windows/Azure focus, Apple Silicon dependency, and our training pipeline integration. + +**Recommendation**: Consider incremental adoption of Cua components, starting with cua-bench adapters for benchmark standardization, rather than full migration. + +--- + +## 1. What is Cua? + +### Overview + +Cua ("koo-ah") is an open-source infrastructure platform for developing, evaluating, and deploying Computer-Use Agents. 
According to their [Hacker News launch](https://news.ycombinator.com/item?id=46768906) and [HuggingFace blog](https://huggingface.co/blog/cua-ai/cua-bench): + +> "Cua is Docker for Computer-Use AI Agents - it enables AI agents to control full operating systems in virtual containers and deploy them locally or to the cloud." + +### Core Components + +The Cua ecosystem is organized as a monorepo with these key packages: + +| Package | Purpose | Tech Stack | +|---------|---------|------------| +| **cua-agent** | AI agent framework for computer-use tasks | Python | +| **cua-computer** | SDK for controlling desktop environments | Python | +| **cua-computer-server** | Sandbox driver for UI interactions | Python/FastAPI | +| **cua-bench** | Benchmarks and RL environments | Python | +| **lume** | macOS/Linux VM management on Apple Silicon | Swift/CLI | +| **lumier** | Docker-compatible interface for Lume VMs | Python | +| **som** | Set-of-Mark for OmniParser integration | Python | +| **pylume** | Python bindings for Lume | Python | +| **mcp-server** | Multi-Modal Control Protocol server for Claude Desktop | Python | + +### Key Capabilities + +1. **Multi-Platform Virtualization**: + - macOS/Linux via Apple Virtualization Framework (97% native CPU speed on Apple Silicon) + - Windows via Docker/QEMU + - Cloud deployment support + +2. **Composite Agents Architecture**: + - Separate grounding model (fast, small) from reasoning model (large) + - Model-agnostic: supports Anthropic, OpenAI, Google, Ollama, LM Studio + +3. **Unified Benchmark Framework (cua-bench)**: + - Adapters for OSWorld, ScreenSpot, WindowsArena + - Trajectory export for training + - RL environment support + +4. **Training Data Generation**: + - "Trajectory replotting": Record 1 demo, render across 10 OS themes = 10 training trajectories + - HTML snapshots with bounding boxes, not just screenshots + - Multi-resolution (640x480 to 3440x1440) + +--- + +## 2. Cua's Approach to Computer Use Automation + +### Architecture Philosophy + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Cua Platform │ +├─────────────────────────────────────────────────────────────────┤ +│ Agent Layer (cua-agent) │ +│ ├── ComputerAgent - Main agent class │ +│ ├── Provider adapters (Anthropic, OpenAI, Ollama, etc.) │ +│ └── Composite agents (grounding + reasoning split) │ +├─────────────────────────────────────────────────────────────────┤ +│ Computer Layer (cua-computer) │ +│ ├── Computer class - Unified interface │ +│ ├── Display drivers (screen capture, coordinates) │ +│ └── Input drivers (mouse, keyboard) │ +├─────────────────────────────────────────────────────────────────┤ +│ Sandbox Layer │ +│ ├── Lume (Apple Silicon VMs - macOS/Linux) │ +│ ├── Docker/QEMU (Windows, Linux) │ +│ └── Cloud containers (cua-cloud) │ +├─────────────────────────────────────────────────────────────────┤ +│ Benchmark Layer (cua-bench) │ +│ ├── OSWorld adapter │ +│ ├── WindowsArena adapter │ +│ ├── ScreenSpot adapter │ +│ └── Custom task definitions │ +└─────────────────────────────────────────────────────────────────┘ +``` + +### Key Technical Decisions + +1. **Sandbox-First**: Every agent runs in an isolated VM/container. This is non-negotiable for safety. + +2. 
**Playwright-Like API**: Tasks defined with declarative Python decorators: + ```python + @cb.setup_task + async def setup(env, scenario): + await env.spotify.open() + await env.spotify.create_playlist(scenario["playlist_name"]) + + @cb.solve_task + async def solve(env, scenario): + await env.spotify.search(scenario["song"]) + ``` + +3. **HTML + Screenshots**: Captures full HTML with bounding boxes, accessibility labels, and CSS - not just screenshots. This enables: + - Element-level grounding + - Style variation generation + - More robust training data + +4. **Shell Applications**: Simulated apps (Spotify, Slack clones) that run in lightweight webtops without VM overhead. Enables rapid iteration. + +--- + +## 3. Comparison with Our WAA-Based Evaluation Setup + +### Our Current Architecture + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ OpenAdapt Ecosystem │ +├─────────────────────────────────────────────────────────────────┤ +│ openadapt-ml (Training) │ +│ ├── training/ - VLM fine-tuning pipeline │ +│ ├── vlm/ - Model adapters (Qwen, API-based) │ +│ ├── baselines/ - Baseline model adapters │ +│ ├── benchmarks/cli.py - VM lifecycle management │ +│ └── cloud/ - Lambda Labs, Azure ML │ +├─────────────────────────────────────────────────────────────────┤ +│ openadapt-evals (Evaluation) │ +│ ├── agents/ - BenchmarkAgent implementations │ +│ │ ├── ApiAgent (Claude, GPT-5.1) │ +│ │ ├── PolicyAgent (trained models) │ +│ │ └── RetrievalAgent (demo-conditioned) │ +│ ├── adapters/ - Benchmark adapters │ +│ │ ├── WAAMockAdapter │ +│ │ └── WAALiveAdapter │ +│ └── benchmarks/ - Runner, viewer, Azure orchestration │ +├─────────────────────────────────────────────────────────────────┤ +│ Infrastructure │ +│ ├── Azure VMs (Standard_D4ds_v5 with nested virt) │ +│ ├── Docker + QEMU (Windows 11 Enterprise via WAA image) │ +│ └── SSH tunnels for VNC/API access │ +└─────────────────────────────────────────────────────────────────┘ +``` + +### Side-by-Side Comparison + +| Aspect | Cua | OpenAdapt-Evals/ML | +|--------|-----|-------------------| +| **Scope** | Full platform (sandboxes, SDKs, benchmarks, training) | Focused on evaluation + ML training | +| **Sandbox Technology** | Lume (Apple Silicon) + Docker/QEMU | Azure VMs + Docker/QEMU | +| **Primary Platform** | macOS first, then Linux/Windows | Windows first (WAA-focused) | +| **Local Dev Experience** | Native macOS VMs on Apple Silicon | Requires Azure VM or local Docker | +| **Benchmark Support** | OSWorld, ScreenSpot, WAA via adapters | WAA only (others planned) | +| **Training Data Gen** | Built-in trajectory replotting | Manual demo collection | +| **Agent Architecture** | Composite (grounding + reasoning) | Monolithic (single API call) | +| **VM Performance** | 97% native on Apple Silicon | Nested virtualization overhead | +| **Cloud Support** | cua-cloud (managed service coming) | Azure VMs, Lambda Labs for training | +| **RL Support** | Native RL environments in cua-bench | Not implemented | +| **Model Agnostic** | Yes (100+ providers) | Yes (Anthropic, OpenAI, local VLMs) | +| **Package Count** | 8+ packages in monorepo | 2 packages | +| **Dependencies** | Python 3.12+ required | Python 3.10+ | +| **Lines of Code** | ~15K+ (estimated) | ~8K | +| **Documentation** | Extensive (cua.ai/docs) | CLAUDE.md + README | +| **Community** | YC-backed, active development | Internal OpenAdapt project | + +### Benchmark Framework Comparison + +#### cua-bench + +```python +# Task definition +@cb.tasks_config +def config(): + return 
{"scenarios": [{"playlist_name": "Workout", "song": "Eye of the Tiger"}, ...]} + +@cb.setup_task +async def setup(env, scenario): + await env.spotify.create_playlist(scenario["playlist_name"]) + +@cb.solve_task +async def solve(env, scenario): + await env.spotify.search(scenario["song"]) + await env.spotify.add_to_playlist(scenario["playlist_name"]) + +@cb.evaluate_task +async def evaluate(env, scenario): + playlist = await env.spotify.get_playlist(scenario["playlist_name"]) + return scenario["song"] in playlist.songs +``` + +**Key Features**: +- Declarative task definition +- Scenario variation injection +- Automatic trajectory recording +- Shell application support (simulated apps) + +#### openadapt-evals + +```python +# Task loaded from JSON +adapter = WAALiveAdapter(server_url="http://vm:5000") +task = adapter.load_task("notepad_1") + +# Agent interaction +agent = ApiAgent(provider="anthropic") +obs = adapter.reset(task) +action = agent.act(obs, task) +obs, done, info = adapter.step(action) +result = adapter.evaluate(task) +``` + +**Key Features**: +- Uses upstream WAA task definitions +- HTTP adapter to WAA server +- Execution trace collection +- P0 demo persistence fix in ApiAgent + +--- + +## 4. Key Differences in Architecture + +### 4.1 Sandbox Philosophy + +| Cua | OpenAdapt | +|-----|-----------| +| Sandboxes are the core primitive | VMs are infrastructure detail | +| Local-first (Apple Silicon VMs) | Cloud-first (Azure VMs) | +| Multiple sandbox types unified | Single sandbox type (WAA Docker) | +| Safety is architectural constraint | Safety via SSH/isolation | + +**Implication**: Cua's sandbox-first design makes it safer and more portable, but requires Lume infrastructure which is Apple Silicon-only. + +### 4.2 Training Data Generation + +| Cua | OpenAdapt | +|-----|-----------| +| Trajectory replotting (1 demo → N variants) | Manual demo collection | +| HTML + screenshots captured | Screenshots only in WAA | +| Built-in visual diversity generation | No automatic variation | +| Shell apps for fast iteration | Full VM required | + +**Implication**: Cua can generate significantly more diverse training data from fewer human demonstrations. This addresses the "10x performance variance across UI changes" problem they identified. + +### 4.3 Agent Architecture + +| Cua | OpenAdapt | +|-----|-----------| +| Composite agents (grounding + reasoning) | Monolithic agents | +| Explicit OmniParser/SoM integration | SoM mode supported but not primary | +| Cost-optimized (small model for grounding) | Full API call for each decision | + +**Implication**: Cua's composite approach could reduce API costs and improve grounding accuracy by using specialized models for each subtask. + +### 4.4 Benchmark Integration + +| Cua | OpenAdapt | +|-----|-----------| +| Unified adapter interface across benchmarks | WAA-specific adapter | +| Native adapters for OSWorld, ScreenSpot, WAA | WAA only (others TODO) | +| Benchmark-agnostic task format | BenchmarkTask dataclass | +| RL environment support | Evaluation only | + +**Implication**: Cua already has the multi-benchmark support we're planning in REPO_CONSOLIDATION_PLAN.md. + +--- + +## 5. Should We Adopt Cua or Parts of It? + +### Arguments FOR Adoption + +1. **Multi-Benchmark Support**: They've already built adapters for OSWorld, ScreenSpot, WAA - exactly what we need. + +2. **Training Data Generation**: Trajectory replotting would dramatically improve our training data diversity. + +3. **Active Development**: YC-backed with active community. 
They're solving the same problems we are. + +4. **Better Local Dev**: macOS VMs on Apple Silicon would enable faster iteration for Mac users. + +5. **RL Support**: Native RL environments would enable future research directions. + +6. **MCP Integration**: Claude Desktop integration via MCP server. + +### Arguments AGAINST Full Adoption + +1. **Apple Silicon Dependency**: Lume requires Apple Silicon. Our team uses Azure VMs which have no Apple Silicon equivalent. + +2. **Windows Focus Mismatch**: We're focused on Windows (WAA) for enterprise use cases. Cua is macOS-first. + +3. **Training Pipeline Integration**: Our training pipeline (openadapt-ml) is tightly integrated with openadapt-evals. Switching to cua-bench would require significant refactoring. + +4. **Operational Complexity**: 8+ packages vs our 2. More to learn and maintain. + +5. **Python 3.12+ Requirement**: We support Python 3.10+. Migration could break user environments. + +6. **Unproven at Scale**: Despite YC backing, it's still early-stage. Our WAA setup is battle-tested. + +7. **Azure VM Investment**: We've invested significant effort in Azure VM automation (PR #14). This would be partially wasted. + +--- + +## 6. Trade-offs Analysis + +### Scenario A: Full Migration to Cua + +**Effort**: High (3-6 months) + +**Benefits**: +- Unified multi-benchmark support +- Training data generation +- Active community support +- MCP/Claude Desktop integration + +**Costs**: +- Significant refactoring of openadapt-ml training pipeline +- Azure VM automation work partially wasted +- New learning curve for team +- Potential compatibility issues with Python 3.10 users + +**Risk**: Medium-High (depending on Cua's stability and our ability to extend it) + +### Scenario B: Adopt cua-bench Adapters Only + +**Effort**: Medium (1-2 months) + +**Benefits**: +- Standardized benchmark interface +- Access to OSWorld, ScreenSpot adapters +- Can still use our Azure VM infrastructure +- Incremental migration path + +**Costs**: +- Must maintain compatibility layer +- Miss out on sandbox/Lume benefits +- Partial adoption may cause confusion + +**Risk**: Low-Medium + +### Scenario C: Adopt Architectural Patterns Only + +**Effort**: Low (2-4 weeks) + +**Benefits**: +- No external dependencies +- Learn from their solutions +- Can implement selectively + +**What to Adopt**: +- Composite agent pattern (grounding + reasoning) +- Trajectory replotting concept +- Declarative task definition style +- HTML capture alongside screenshots + +**Costs**: +- Must implement ourselves +- No community support + +**Risk**: Low + +### Scenario D: Stay Current Course + +**Effort**: None + +**Benefits**: +- Known system, no learning curve +- REPO_CONSOLIDATION_PLAN.md already addresses multi-benchmark support +- Full control over architecture + +**Costs**: +- Slower to add OSWorld, other benchmarks +- No training data generation automation +- Potentially duplicating work + +**Risk**: Low (but higher opportunity cost) + +--- + +## 7. Recommendations + +### Immediate (Next 2-4 Weeks) + +1. **Do NOT migrate to Cua wholesale**. The Azure VM investment is too recent, and we have a working system. + +2. **Adopt the composite agent pattern** in ApiAgent: + - Add optional grounding model (OmniParser/SoM) + - Use small model for element detection, large model for reasoning + - This is an incremental change to existing code + +3. 
**Add HTML capture** to WAALiveAdapter: + - Capture accessibility tree alongside screenshots + - Enables future training data diversity + +### Medium-Term (Next 2-3 Months) + +4. **Evaluate cua-bench integration**: + - Test if cua-bench adapters can work with our evaluation runner + - If compatible, adopt their OSWorld/ScreenSpot adapters + - Keep our WAALiveAdapter for Azure VM compatibility + +5. **Implement trajectory replotting prototype**: + - Record demos with HTML + screenshots + - Test re-rendering across Windows themes + - Measure training data quality improvement + +### Long-Term (6+ Months) + +6. **Consider Lume for local development**: + - If team has Apple Silicon Macs + - Would enable faster local iteration + - Keep Azure VMs for CI/production + +7. **Contribute back to Cua**: + - Our Azure VM automation could benefit the community + - Windows-focused improvements + +--- + +## 8. Specific Recommendations for REPO_CONSOLIDATION_PLAN.md + +Our current consolidation plan is **still valid** but should incorporate these learnings: + +1. **Keep the two-package split** (openadapt-evals + openadapt-ml). Cua's monorepo with 8+ packages is more complex than necessary for our use case. + +2. **Add benchmark adapter interface** compatible with cua-bench: + ```python + class BenchmarkAdapter(ABC): + # Our current interface is similar to cua-bench + # Add optional HTML capture in observations + # Add evaluation spec support + ``` + +3. **Prioritize OSWorld adapter** as second benchmark (after WAA). Cua's OSWorld-Verified work validates this as the next target. + +4. **Consider shell applications** for testing: + - Simulated apps for unit tests + - No VM overhead for CI + - This is orthogonal to our VM-based evaluation + +5. **Document composite agent pattern** in CLAUDE.md for future implementation. + +--- + +## 9. Conclusion + +Cua is an impressive and comprehensive platform that addresses many problems we're solving. However, full migration is not recommended at this time due to: + +1. Our recent Azure VM automation investment +2. Apple Silicon dependency in Lume +3. Windows-first focus vs their macOS-first approach + +Instead, we should: +- **Learn from their architecture** (composite agents, trajectory replotting) +- **Evaluate cua-bench adapters** for multi-benchmark support +- **Stay on our current consolidation path** while incorporating their patterns + +The OpenAdapt ecosystem can achieve similar capabilities through incremental improvements rather than wholesale migration. + +--- + +## 10. Appendix: Agent Loop Types in Cua + +Cua provides multiple agent loop implementations optimized for different use cases: + +| Agent Loop | Best For | Model Support | +|------------|----------|---------------| +| **AgentLoop.OPENAI** | Web-based tasks, browser automation | OpenAI models (requires Tier 3 access) | +| **AgentLoop.ANTHROPIC** | Strong reasoning + computer-use | claude-3-5-sonnet, claude-3-7-sonnet | +| **AgentLoop.UITARS** | OS/desktop tasks, latency-sensitive | UI-TARS-1.5 (local or HuggingFace) | +| **AgentLoop.OMNI** | Maximum flexibility | Any vision-language model | + +### Composite Agent Example + +```python +# Pair a grounding model with a reasoning model +model = "huggingface-local/GTA1-7B+openai/gpt-4o" +# GTA1-7B: precise click coordinates +# GPT-4o: action planning and reasoning +``` + +--- + +## 11. Appendix: OpenAdapt-ML Docker Setup Details + +Our current implementation uses a custom Dockerfile that: + +1. 
**Base**: `dockurr/windows:latest` (modern Windows ISO auto-download) +2. **WAA Components**: Copied from `windowsarena/winarena:latest` +3. **IP Patching**: Changes `20.20.20.21` to `172.30.0.2` for dockurr compatibility +4. **Python**: Uses Python 3.9 from vanilla WAA for GroundingDINO compatibility +5. **Automation**: FirstLogonCommands for firewall, WAA server auto-start + +Key environment variables: +- `VERSION=11e` - Windows 11 Enterprise Evaluation +- `RAM_SIZE=8G` / `16G` (fast mode) +- `CPU_CORES=4` / `6` (fast mode) + +--- + +## References + +- [Cua GitHub Repository](https://github.com/trycua/cua) +- [Cua-Bench HuggingFace Blog](https://huggingface.co/blog/cua-ai/cua-bench) +- [Show HN: Cua-Bench Discussion](https://news.ycombinator.com/item?id=46768906) +- [Launch HN: Cua (YC X25)](https://news.ycombinator.com/item?id=43773563) +- [Cua Documentation](https://cua.ai/docs) +- [Cua Composite Agents Blog](https://www.trycua.com/blog/composite-agents) +- [What is Lume?](https://cua.ai/docs/lume/guide/getting-started/introduction) +- [OSWorld-Verified](https://xlang.ai/blog/osworld-verified) +- [Windows Agent Arena](https://microsoft.github.io/WindowsAgentArena/) +- [Windows Agent Arena Paper](https://arxiv.org/abs/2409.08264) +- [OpenAI Computer-Using Agent](https://openai.com/index/computer-using-agent/) +- [OpenAdapt REPO_CONSOLIDATION_PLAN.md](/Users/abrichr/oa/src/openadapt-ml/docs/REPO_CONSOLIDATION_PLAN.md) From d9acbcfe2ece98f19dd9d37cfc044a97fdde6652 Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Wed, 28 Jan 2026 18:47:05 -0500 Subject: [PATCH 15/21] feat(cli): add parallelization support with --worker-id and --num-workers WAA natively supports parallel execution by distributing tasks across workers. Usage: # Run on single VM (default) run --num-tasks 154 # Run in parallel on multiple VMs VM1: run --num-tasks 154 --worker-id 0 --num-workers 3 VM2: run --num-tasks 154 --worker-id 1 --num-workers 3 VM3: run --num-tasks 154 --worker-id 2 --num-workers 3 Tasks auto-distribute: worker 0 gets tasks 0-51, worker 1 gets 52-103, etc. 
Co-Authored-By: Claude Opus 4.5 --- openadapt_ml/benchmarks/cli.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/openadapt_ml/benchmarks/cli.py b/openadapt_ml/benchmarks/cli.py index caf0567..a4a51c4 100644 --- a/openadapt_ml/benchmarks/cli.py +++ b/openadapt_ml/benchmarks/cli.py @@ -920,6 +920,14 @@ def cmd_run(args): f"--domain {domain}", ] + # Add parallelization flags if specified (argparse converts hyphens to underscores) + worker_id = getattr(args, "worker_id", 0) + num_workers = getattr(args, "num_workers", 1) + if num_workers > 1: + run_args.append(f"--worker_id {worker_id}") + run_args.append(f"--num_workers {num_workers}") + log("RUN", f"Parallel mode: worker {worker_id}/{num_workers}") + # If specific task requested, create custom test config if task: create_custom_test_cmd = f''' @@ -1895,6 +1903,18 @@ def main(): p_run.add_argument( "--no-download", action="store_true", help="Skip downloading results" ) + p_run.add_argument( + "--worker-id", + type=int, + default=0, + help="Worker ID for parallel execution (0-indexed)", + ) + p_run.add_argument( + "--num-workers", + type=int, + default=1, + help="Total number of parallel workers", + ) p_run.set_defaults(func=cmd_run) # download From 075a35d454be13ee54222bbc731dff109d4b9436 Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Wed, 28 Jan 2026 18:47:49 -0500 Subject: [PATCH 16/21] docs(research): add market positioning and strategic differentiation Expand cua_waa_comparison.md with: - Success rate gap analysis (38.1% vs 19.5%) - Market positioning comparison (TAM, buyers, value props) - Where sandbox approach fails (Citrix, licensed SW, compliance) - Shell applications convergence opportunities - Bottom line: Windows enterprise automation is hard, validates OpenAdapt approach Co-Authored-By: Claude Opus 4.5 --- docs/research/cua_waa_comparison.md | 79 ++++++++++++++++++++++++++++- 1 file changed, 77 insertions(+), 2 deletions(-) diff --git a/docs/research/cua_waa_comparison.md b/docs/research/cua_waa_comparison.md index 8c38570..f4b12d0 100644 --- a/docs/research/cua_waa_comparison.md +++ b/docs/research/cua_waa_comparison.md @@ -477,7 +477,82 @@ The OpenAdapt ecosystem can achieve similar capabilities through incremental imp --- -## 10. Appendix: Agent Loop Types in Cua +## 10. Market Positioning and Strategic Differentiation + +### 10.1 The Success Rate Gap + +| Agent | Benchmark | Success Rate | Gap to Human | +|-------|-----------|--------------|--------------| +| OpenAI CUA | OSWorld | 38.1% | ~36 pts below human (74.5%) | +| Microsoft Navi | WAA | 19.5% | ~55 pts below human (74.5%) | + +**Key insight**: The problem is far from solved. Both approaches have runway—the technology isn't mature enough for either to dominate yet. + +The 38.1% vs 19.5% gap is significant: +- OSWorld is macOS/Linux focused +- WAA is Windows focused +- **Windows automation appears harder** (more legacy complexity, more app diversity) + +This validates OpenAdapt's focus: Windows enterprise workflows are the harder problem. 
+ +### 10.2 Market Positioning + +| Aspect | Cua | OpenAdapt | +|--------|-----|-----------| +| **Primary TAM** | AI Agents / Developer Tools (~$500M-1B, 40%+ CAGR) | Enterprise RPA + Legacy Automation (~$8-10B, 20% CAGR) | +| **Buyer** | ML engineers, AI researchers | Ops, IT, compliance, support | +| **Value Prop** | "Build computer-use agents faster" | "Learn automation from how you already work" | + +### 10.3 Why These Markets Don't Fully Overlap + +- Cua assumes synthetic, controlled environments +- OpenAdapt captures real workflows from production systems +- Enterprise compliance requirements (HIPAA, SOX) favor retrospective capture + +### 10.4 Where Cua's Sandbox Approach Breaks Down + +Cua's sandbox-first design assumes you can: +- Spin up a clean VM with the target app +- Control the environment end-to-end +- Reproduce the workflow deterministically + +**This fails for:** + +| Scenario | Why Sandboxes Fail | OpenAdapt Alternative | +|----------|-------------------|----------------------| +| **Citrix/RDP apps** | No local install possible | Capture remote session natively | +| **Licensed enterprise software** | SAP, Epic, Oracle—can't sandbox without licensing | Record from licensed desktop | +| **Policy-controlled desktops** | Enterprise IT won't allow arbitrary VMs | Capture from existing desktop | +| **Compliance-restricted environments** | Healthcare, finance—can't replicate production | Retrospective recording allowed | +| **Multi-app workflows** | Spanning 5+ apps that can't all be sandboxed together | Single recording captures all | + +**OpenAdapt's retrospective recording doesn't have these constraints.** + +### 10.5 Shell Applications: Where Cua and OpenAdapt Could Converge + +Shell apps (simulated Spotify, Slack clones) serve different purposes: + +| Use Case | Cua's Approach | OpenAdapt's Approach | +|----------|---------------|---------------------| +| Unit tests | Primary use case | Could adopt for CI | +| Training data | Synthetic generation | Not applicable (need real data) | +| Fast iteration | Core workflow | Could speed up agent logic dev | +| Production eval | Not representative | Azure VMs remain primary | + +**Recommendation**: Adopt shell apps for regression testing agent logic, but never train on them. Real behavioral data from enterprise workflows remains the moat. + +### 10.6 Bottom Line + +The 19.5% WAA success rate validates OpenAdapt's approach: +- Windows enterprise automation is hard +- Current agents fail often +- Learning from real human demonstrations is one path to improvement + +Cua's strength (macOS VMs at 97% native speed) doesn't help with SAP, Citrix, or legacy Win32 apps—exactly where OpenAdapt focuses. + +--- + +## 12. Appendix: Agent Loop Types in Cua Cua provides multiple agent loop implementations optimized for different use cases: @@ -499,7 +574,7 @@ model = "huggingface-local/GTA1-7B+openai/gpt-4o" --- -## 11. Appendix: OpenAdapt-ML Docker Setup Details +## 13. 
Appendix: OpenAdapt-ML Docker Setup Details Our current implementation uses a custom Dockerfile that: From 60227725468a45c0a0463bc534065b52e44f4efe Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Wed, 28 Jan 2026 23:38:13 -0500 Subject: [PATCH 17/21] docs(waa): add parallelization and scalable benchmark design docs - Add WAA_PARALLELIZATION_DESIGN.md documenting: - Official WAA approach (Azure ML Compute) - Our dedicated VM approach (dev/debug) - When to use each approach - Add WAA_UNATTENDED_SCALABLE.md documenting: - Goal: unattended, scalable, programmatic WAA - Synthesized approach using official run_azure.py - Implementation plan and cost estimates - Update Dockerfile comments to clarify: - API agents (api-claude, api-openai) run externally - openadapt-evals CLI connects via SSH tunnel - No internal run.py patching needed Co-Authored-By: Claude Opus 4.5 --- docs/WAA_PARALLELIZATION_DESIGN.md | 331 ++++++++++++++++++ docs/WAA_UNATTENDED_SCALABLE.md | 298 ++++++++++++++++ openadapt_ml/benchmarks/waa_deploy/Dockerfile | 11 +- 3 files changed, 634 insertions(+), 6 deletions(-) create mode 100644 docs/WAA_PARALLELIZATION_DESIGN.md create mode 100644 docs/WAA_UNATTENDED_SCALABLE.md diff --git a/docs/WAA_PARALLELIZATION_DESIGN.md b/docs/WAA_PARALLELIZATION_DESIGN.md new file mode 100644 index 0000000..40d7684 --- /dev/null +++ b/docs/WAA_PARALLELIZATION_DESIGN.md @@ -0,0 +1,331 @@ +# WAA Benchmark Parallelization Design + +**Last Updated:** 2026-01-29 + +## Overview + +This document describes two approaches for running Windows Agent Arena (WAA) benchmarks: + +1. **Dedicated VM Approach** (our current setup) - For development, debugging, small runs +2. **Azure ML Compute Approach** (official WAA) - For full benchmark runs at scale + +## Official WAA Approach: Azure ML Compute + +The official WAA repository uses Azure ML Compute Instances for parallelization. + +**Source:** [README.md](https://github.com/microsoft/WindowsAgentArena/blob/main/README.md) +> "WAA supports the deployment of agents **at scale** using the Azure ML cloud infrastructure, allowing for the parallel running of multiple agents and delivering quick benchmark results for hundreds of tasks in minutes, not days." + +**Implementation:** [scripts/run_azure.py](https://github.com/microsoft/WindowsAgentArena/blob/main/scripts/run_azure.py) + +```python +# Official WAA creates Azure ML Compute Instances +from azure.ai.ml.entities import ComputeInstance + +compute_instance = ComputeInstance( + name=f"w{worker_id}Exp{exp_name}", + size="Standard_D8_v3", # 8 vCPU, nested virtualization + setup_scripts=setup_scripts, + idle_time_before_shutdown_minutes=600, + ssh_public_access_enabled=True +) +ml_client.begin_create_or_update(compute_instance).result() + +# Uses multiprocessing.Process for parallel workers +for worker_id in range(num_workers): + p = Process(target=launch_vm_and_job, args=(worker_id, ...)) + processes.append(p) + p.start() +``` + +--- + +## Our Approach: Dedicated Azure VM + +We use a single dedicated Azure VM for development and debugging. 
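+
+Once the SSH tunnels shown in the architecture diagram below are up, evaluation code on the local machine drives the VM as if the WAA server were local. A minimal sketch (assuming `WAALiveAdapter` and `ApiAgent` are exported from `openadapt_evals`, and using the forwarded port from the diagram):
+
+```python
+# Sketch only — mirrors the openadapt-evals usage pattern documented in the
+# comparison doc; the import path and port forwarding are assumptions.
+from openadapt_evals import ApiAgent, WAALiveAdapter
+
+adapter = WAALiveAdapter(server_url="http://localhost:5001")  # tunneled to VM:5000
+agent = ApiAgent(provider="anthropic")
+
+task = adapter.load_task("notepad_1")
+obs = adapter.reset(task)
+done = False
+while not done:
+    action = agent.act(obs, task)
+    obs, done, info = adapter.step(action)
+print(adapter.evaluate(task))
+```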
+ +### Architecture + +``` +┌─────────────────────────────────────────────────────────────────────────┐ +│ LOCAL MACHINE │ +│ │ +│ openadapt-ml CLI │ +│ ├── SSH tunnel for VNC (localhost:8006 → VM:8006) │ +│ ├── SSH tunnel for WAA API (localhost:5001 → VM:5000) │ +│ └── Direct SSH for commands │ +│ │ +└─────────────────────────────────┬───────────────────────────────────────┘ + │ + ▼ + ┌─────────────┐ + │ waa-eval-vm │ + │ D4ds_v4 │ + │ │ + │ Docker │ + │ └─QEMU │ + │ └─Win11 │ + │ └─WAA │ + └─────────────┘ +``` + +--- + +## When to Use Which Approach + +| Use Case | Dedicated VM | Azure ML Compute | +|----------|--------------|------------------| +| **Development/debugging** | ✅ Better - VNC, SSH, full control | ❌ Harder to debug | +| **Single task testing** | ✅ Simpler | ❌ Overkill | +| **Quick iteration** | ✅ VM stays running | ❌ Compute instances spin up/down | +| **Cost for small runs** | ✅ One VM, pay as you go | ❌ ML workspace overhead | +| **Parallel at scale (40+ workers)** | ❌ Manual VM management | ✅ Designed for this | +| **Full 154-task benchmark** | ❌ ~5 hours sequential | ✅ ~30 min with 10 workers | + +**Recommendation:** +- Use **dedicated VM** for development and debugging +- Use **Azure ML Compute** (official approach) for full benchmark runs + +--- + +## Dedicated VM Details + +### Current Setup + +- **VM Name:** `waa-eval-vm` +- **Size:** `Standard_D4ds_v4` (4 vCPU, 16GB RAM, nested virtualization) +- **IP:** 20.12.180.208 +- **OS:** Ubuntu 22.04 LTS +- **Software:** Docker with `windowsarena/winarena:latest` + +### CLI Commands + +```bash +# VM management +uv run python -m openadapt_ml.benchmarks.cli create # Create VM +uv run python -m openadapt_ml.benchmarks.cli status # Check status +uv run python -m openadapt_ml.benchmarks.cli probe # Check WAA server +uv run python -m openadapt_ml.benchmarks.cli vnc # Open VNC tunnel +uv run python -m openadapt_ml.benchmarks.cli logs # View logs +uv run python -m openadapt_ml.benchmarks.cli deallocate # Stop billing +uv run python -m openadapt_ml.benchmarks.cli delete # Delete VM +``` + +### Access + +- **VNC:** http://localhost:8006 (via SSH tunnel) +- **SSH:** `ssh azureuser@20.12.180.208` + +--- + +## Azure ML Compute Details (Official WAA) + +### Setup Requirements + +1. Azure subscription with ML workspace +2. Storage account for golden image +3. Compute instance startup script +4. vCPU quota (8 vCPU per worker × N workers) + +### Running Official WAA at Scale + +```bash +cd WindowsAgentArena + +# Run with 10 workers +python scripts/run_azure.py \ + --num_workers 10 \ + --agent navi \ + --model_name gpt-4o \ + --json_name evaluation_examples_windows/test_all.json +``` + +### Cost Estimate (Azure ML) + +| Workers | VM Size | vCPUs Each | Total vCPUs | Time for 154 tasks | Est. Cost | +|---------|---------|------------|-------------|-------------------|-----------| +| 1 | D8_v3 | 8 | 8 | ~5 hours | ~$2 | +| 5 | D8_v3 | 8 | 40 | ~1 hour | ~$2 | +| 10 | D8_v3 | 8 | 80 | ~30 min | ~$2 | + +--- + +## Components + +### 1. Dedicated Azure VMs + +Each VM is identical: +- **Size:** `Standard_D4ds_v4` (4 vCPU, 16GB RAM, nested virtualization) +- **OS:** Ubuntu 22.04 LTS +- **Software:** Docker with `windowsarena/winarena:latest` image +- **Inside Docker:** QEMU running Windows 11 with WAA Flask server + +### 2. 
Task Distribution + +- 154 total WAA tasks +- Tasks distributed round-robin across N VMs +- Each VM runs tasks sequentially (WAA limitation - one Windows instance per container) +- No inter-VM communication needed (embarrassingly parallel) + +### 3. Orchestration (ThreadPoolExecutor) + +```python +# Simplified pseudocode +with ThreadPoolExecutor(max_workers=N) as executor: + # Phase 1: Create VMs in parallel + vm_futures = [executor.submit(create_vm, f"waa-eval-vm-{i}") for i in range(N)] + vms = [f.result() for f in vm_futures] + + # Phase 2: Distribute tasks + task_assignments = distribute_tasks(tasks, vms) # round-robin + + # Phase 3: Run tasks in parallel (one thread per VM) + result_futures = [ + executor.submit(run_tasks_on_vm, vm, assigned_tasks) + for vm, assigned_tasks in task_assignments + ] + results = [f.result() for f in result_futures] + + # Phase 4: Cleanup VMs + for vm in vms: + executor.submit(delete_vm, vm) +``` + +## Tradeoffs: Dedicated VM vs Azure ML Compute + +| Aspect | Dedicated VM (Our Approach) | Azure ML Compute (Official WAA) | +|--------|----------------------------|--------------------------------| +| **Best for** | Development, debugging, small runs | Full benchmark at scale | +| **Simplicity** | Simple Azure CLI | Complex ML SDK | +| **Control** | Full control, VNC, SSH | Managed (less visibility) | +| **Debugging** | Easy - VNC shows Windows | Harder - logs only | +| **Parallelization** | Manual (multiple VMs) | Built-in (num_workers flag) | +| **Cost** | Pay for VM only | VM + ML workspace | +| **Dependencies** | Azure CLI | Azure ML SDK, workspace, storage | + +**Decision:** Use BOTH approaches for different purposes. + +## VM Lifecycle + +``` +┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ +│ CREATE │────▶│ SETUP │────▶│ RUN │────▶│ DELETE │ +└──────────┘ └──────────┘ └──────────┘ └──────────┘ + │ │ │ │ + ▼ ▼ ▼ ▼ + az vm create docker pull run.py az vm delete + ~2 min winarena:latest tasks ~1 min + Windows boot ~2 min/task + ~15 min (first) + ~3 min (cached) +``` + +### Optimization: Pre-warmed VM Pool + +To avoid 15-minute first-boot time: +1. Create VMs once with Windows installed +2. **Deallocate** (stops billing, preserves disk) +3. **Start** when needed (~2 min) +4. Run tasks +5. **Deallocate** again (not delete) + +```bash +# Initial setup (once) +uv run python -m openadapt_ml.benchmarks.cli create --name waa-eval-vm-1 +uv run python -m openadapt_ml.benchmarks.cli create --name waa-eval-vm-2 +# ... wait for Windows to install on each ... 
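+# One way to wait without babysitting each VM (assumption: `probe` exits
+# non-zero until the WAA server responds — verify the actual exit-code behavior):
+# until uv run python -m openadapt_ml.benchmarks.cli probe; do sleep 60; done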
+ +# Before benchmark run +uv run python -m openadapt_ml.benchmarks.cli vm-start --name waa-eval-vm-1 +uv run python -m openadapt_ml.benchmarks.cli vm-start --name waa-eval-vm-2 + +# After benchmark run (stops billing, keeps disk) +uv run python -m openadapt_ml.benchmarks.cli deallocate --name waa-eval-vm-1 +uv run python -m openadapt_ml.benchmarks.cli deallocate --name waa-eval-vm-2 +``` + +## Scaling Considerations + +### Azure vCPU Quota + +| VM Size | vCPUs | Max VMs (10 vCPU quota) | Max VMs (40 vCPU quota) | +|---------|-------|-------------------------|-------------------------| +| D4ds_v4 | 4 | 2 | 10 | +| D2ds_v4 | 2 | 5 | 20 | + +**Current quota:** 10 vCPUs (Standard D Family) +**Recommended:** Request increase to 40+ vCPUs for 10 parallel VMs + +### Cost Estimate + +| Workers | VM Size | $/hr each | Total $/hr | 154 tasks @ 2min/task | Total Cost | +|---------|---------|-----------|------------|----------------------|------------| +| 1 | D4ds_v4 | $0.19 | $0.19 | 5.1 hrs | ~$1.00 | +| 5 | D4ds_v4 | $0.19 | $0.95 | 1.0 hr | ~$1.00 | +| 10 | D4ds_v4 | $0.19 | $1.90 | 0.5 hr | ~$1.00 | + +**Note:** More workers = faster completion, similar total cost (dominated by compute time, not wall time). + +## CLI Commands (Proposed) + +```bash +# Create a pool of VMs +uv run python -m openadapt_ml.benchmarks.cli pool create --count 5 + +# Start all VMs in pool +uv run python -m openadapt_ml.benchmarks.cli pool start + +# Run benchmark across pool +uv run python -m openadapt_ml.benchmarks.cli run --parallel --tasks 154 + +# Deallocate pool (stop billing) +uv run python -m openadapt_ml.benchmarks.cli pool deallocate + +# Delete pool entirely +uv run python -m openadapt_ml.benchmarks.cli pool delete +``` + +## Implementation Plan + +### Phase 1: Single Dedicated VM (DONE) +- [x] Create VM with CLI (`uv run python -m openadapt_ml.benchmarks.cli create`) +- [x] Run WAA benchmarks on single VM +- [x] VNC access for debugging +- [x] Results collection + +### Phase 2: Scale with Official WAA (TODO) +- [ ] Set up Azure ML workspace (if not exists) +- [ ] Upload golden image to storage account +- [ ] Configure `scripts/run_azure.py` with our credentials +- [ ] Request vCPU quota increase (80+ for 10 workers) +- [ ] Run full 154-task benchmark with `--num_workers 10` + +### Phase 3: Integration (OPTIONAL) +- [ ] Wrapper CLI to invoke official `run_azure.py` +- [ ] Results download and analysis +- [ ] Cost tracking + +**Note:** We're NOT building our own VM pool management. The official WAA `run_azure.py` already does this well. 
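+
+For reference, the round-robin `distribute_tasks` helper named in the orchestration pseudocode above can be a single slice per VM (a sketch; names are illustrative):
+
+```python
+def distribute_tasks(tasks: list[str], vms: list[str]) -> list[tuple[str, list[str]]]:
+    """Assign task IDs round-robin: VM i gets tasks i, i+N, i+2N, ..."""
+    return [(vm, tasks[i :: len(vms)]) for i, vm in enumerate(vms)]
+
+# Example: 154 tasks across 5 VMs -> 31, 31, 31, 31, 30 tasks per VM.
+```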
+ +## Files + +| File | Purpose | +|------|---------| +| `openadapt_ml/benchmarks/cli.py` | CLI for single dedicated VM (dev/debug) | +| `vendor/WindowsAgentArena/scripts/run_azure.py` | Official WAA parallel execution | + +## Related Documents + +- `docs/WAA_APPROACH_REVIEW.md` - Why vanilla WAA, not custom Dockerfile +- `CLAUDE.md` - CLI-first development guidelines +- `/Users/abrichr/oa/src/STATUS.md` - Project priorities +- [Official WAA README](https://github.com/microsoft/WindowsAgentArena/blob/main/README.md) - Azure ML setup instructions + +## Decision Log + +| Date | Decision | Rationale | +|------|----------|-----------| +| 2026-01-29 | Use dedicated VM for dev/debug | Full control, VNC, easy iteration | +| 2026-01-29 | Use official WAA `run_azure.py` for scale | Don't reinvent the wheel | +| 2026-01-29 | Don't build custom VM pool | Official WAA already handles this | +| 2026-01-29 | ThreadPoolExecutor sufficient | Ray is overkill (agent a7d43c3 analysis) | diff --git a/docs/WAA_UNATTENDED_SCALABLE.md b/docs/WAA_UNATTENDED_SCALABLE.md new file mode 100644 index 0000000..2de68c3 --- /dev/null +++ b/docs/WAA_UNATTENDED_SCALABLE.md @@ -0,0 +1,298 @@ +# Unattended Scalable Programmatic WAA + +**Last Updated:** 2026-01-29 + +## Goal + +Run Windows Agent Arena (WAA) benchmark with: +- **Unattended**: No manual intervention (Windows auto-installs, server auto-starts) +- **Scalable**: N parallel workers (10+ for full 154-task benchmark in ~30 min) +- **Programmatic**: Single command execution + +## Current State + +### What Official WAA Provides + +``` +┌─────────────────────────────────────────────────────────────────────────┐ +│ LOCAL: python scripts/run_azure.py --num_workers 10 │ +└─────────────────────────────────┬───────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────┐ +│ AZURE ML WORKSPACE │ +│ │ +│ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ │ +│ │ Compute │ │ Compute │ │ Compute │ ... │ Compute │ │ +│ │ Instance │ │ Instance │ │ Instance │ │ Instance │ │ +│ │ Worker 0 │ │ Worker 1 │ │ Worker 2 │ │ Worker N │ │ +│ └────┬─────┘ └────┬─────┘ └────┬─────┘ └────┬─────┘ │ +│ │ │ │ │ │ +│ ▼ ▼ ▼ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────┐ │ +│ │ Each instance runs: Docker → QEMU → Windows → WAA → Navi │ │ +│ └─────────────────────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────────────┘ +``` + +**Pros:** +- ✅ Parallelization built-in (`--num_workers N`) +- ✅ Azure ML handles compute lifecycle +- ✅ Auto-shutdown on idle +- ✅ Results to Azure Storage + +**Cons:** +- ❌ Only supports Navi agent (not our API agents) +- ❌ Requires pre-uploaded golden image to Azure Storage +- ❌ Complex Azure ML setup (workspace, storage, startup script) +- ❌ Limited debugging (no VNC) + +### What We Built + +| Component | Purpose | Useful? 
| +|-----------|---------|---------| +| `waa_deploy/Dockerfile` | Auto-download Windows, API agent support | ✅ For dev | +| `waa_deploy/api_agent.py` | Claude/OpenAI agent (alternative to Navi) | ✅ Key differentiator | +| `cli.py` | Dedicated VM management | ✅ For dev/debug | +| `WAALiveAdapter` | Connects to WAA server API | ✅ Portable | +| `ApiAgent` | Structured actions via LLM API | ✅ Portable | + +--- + +## Synthesized Approach + +### Architecture + +``` +┌─────────────────────────────────────────────────────────────────────────┐ +│ LOCAL: uv run python -m openadapt_ml.benchmarks.cli scale │ +│ --workers 10 --agent api-openai --tasks 154 │ +└─────────────────────────────────┬───────────────────────────────────────┘ + │ + ┌─────────────┴─────────────┐ + │ Use official run_azure.py│ + │ for compute orchestration│ + └─────────────┬─────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────┐ +│ AZURE ML COMPUTE INSTANCES │ +│ │ +│ Each instance runs our modified Docker image: │ +│ ┌─────────────────────────────────────────────────────────────────┐ │ +│ │ waa-auto:latest (our Dockerfile) │ │ +│ │ ├── dockurr/windows (auto-downloads Windows 11) │ │ +│ │ ├── windowsarena/winarena components │ │ +│ │ ├── api_agent.py (Claude/OpenAI support) │ │ +│ │ └── Auto-start WAA server on boot │ │ +│ └─────────────────────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────────────┘ +``` + +### Key Insight + +**Don't reinvent parallelization.** Use official `run_azure.py` for compute orchestration, but: +1. Replace their Docker image with ours (`waa-auto:latest`) +2. Add our API agent to the agent options + +--- + +## Implementation Plan + +### Phase 1: Validate Single Worker (DONE) + +- [x] Dedicated VM working (`waa-eval-vm`) +- [x] VNC access for debugging +- [x] WAA server auto-starts +- [x] Benchmark runs with Navi agent + +### Phase 2: Add API Agent to Official WAA + +**Goal:** Run `python run_azure.py --agent api-openai` + +**Steps:** + +1. **Create NaviAgent-compatible wrapper:** + ```python + # mm_agents/openadapt/agent.py + class OpenAdaptAgent: + """Wrapper to use our ApiAgent with official WAA runner.""" + + def __init__(self, model="gpt-4o", provider="openai"): + self.provider = provider + self.model = model + # Initialize API client + + def predict(self, instruction: str, obs: Dict) -> List[str]: + """Convert observation → API call → action code.""" + # 1. Extract screenshot from obs + # 2. Call OpenAI/Claude API + # 3. Parse response to action + # 4. Return as Python code string + return [f"computer.mouse.click(x={x}, y={y})"] + + def reset(self): + self.history = [] + ``` + +2. **Modify official `run.py` to support new agent:** + ```python + # In run.py, add: + elif cfg_args["agent_name"] == "api-openai": + from mm_agents.openadapt.agent import OpenAdaptAgent + agent = OpenAdaptAgent(provider="openai", model=cfg_args["model"]) + elif cfg_args["agent_name"] == "api-claude": + from mm_agents.openadapt.agent import OpenAdaptAgent + agent = OpenAdaptAgent(provider="anthropic", model=cfg_args["model"]) + ``` + +3. **Test locally first:** + ```bash + # On dedicated VM + cd /client + python run.py --agent api-openai --model gpt-4o --test_all_meta_path ... + ``` + +### Phase 3: Push Custom Image to Azure + +**Goal:** Azure ML uses our `waa-auto:latest` instead of `windowsarena/winarena:latest` + +**Steps:** + +1. 
**Push to Azure Container Registry:** + ```bash + # Build locally + docker build -t waa-auto:latest -f waa_deploy/Dockerfile . + + # Tag for ACR + docker tag waa-auto:latest openadaptacr.azurecr.io/waa-auto:latest + + # Push + az acr login --name openadaptacr + docker push openadaptacr.azurecr.io/waa-auto:latest + ``` + +2. **Modify `run_azure.py` to use our image:** + ```python + # Change default: + parser.add_argument('--docker_img_name', + default='openadaptacr.azurecr.io/waa-auto:latest', # Was: windowsarena/winarena:latest + help='Docker image name') + ``` + +### Phase 4: Wrapper CLI + +**Goal:** Single command for everything + +```bash +# Full benchmark with 10 workers +uv run python -m openadapt_ml.benchmarks.cli scale \ + --workers 10 \ + --agent api-openai \ + --model gpt-4o \ + --tasks all + +# Subset for testing +uv run python -m openadapt_ml.benchmarks.cli scale \ + --workers 2 \ + --agent api-claude \ + --tasks notepad_1,notepad_2,browser_1 +``` + +**Implementation:** +```python +# In cli.py, add 'scale' command that: +# 1. Ensures Azure ML workspace exists +# 2. Ensures our image is in ACR +# 3. Calls run_azure.py with appropriate args +# 4. Monitors progress +# 5. Downloads results when done +``` + +--- + +## File Changes Required + +| File | Change | Effort | +|------|--------|--------| +| `mm_agents/openadapt/agent.py` | NEW: NaviAgent-compatible wrapper | ~100 lines | +| `run.py` | MODIFY: Add api-openai/api-claude agent options | ~10 lines | +| `waa_deploy/Dockerfile` | EXISTING: Already has api_agent.py | Done | +| `cli.py` | ADD: `scale` command | ~200 lines | +| `run_azure.py` | MODIFY: Default to our Docker image | ~5 lines | + +--- + +## Prerequisites + +### Azure Setup (One-time) + +1. **Azure ML Workspace** (if not exists) + ```bash + az ml workspace create -n openadapt-ml -g openadapt-agents + ``` + +2. **Azure Container Registry** + ```bash + az acr create -n openadaptacr -g openadapt-agents --sku Basic + ``` + +3. **vCPU Quota** (request increase) + - Standard_D8_v3: 8 vCPUs per worker + - 10 workers = 80 vCPUs needed + - Request via Azure Portal → Quotas + +4. **Upload startup script** to Azure ML Notebooks + - Path: `Users//compute-instance-startup.sh` + - Content: From `scripts/azure_files/compute-instance-startup.sh` + +### Environment Variables + +```bash +# .env file +AZURE_SUBSCRIPTION_ID=... +AZURE_ML_RESOURCE_GROUP=openadapt-agents +AZURE_ML_WORKSPACE_NAME=openadapt-ml +OPENAI_API_KEY=sk-... +ANTHROPIC_API_KEY=sk-ant-... +``` + +--- + +## Cost Estimate + +| Workers | VM Size | Time for 154 tasks | Compute Cost | API Cost (GPT-4o) | Total | +|---------|---------|-------------------|--------------|-------------------|-------| +| 1 | D8_v3 | ~5 hours | ~$2.50 | ~$5 | ~$7.50 | +| 5 | D8_v3 | ~1 hour | ~$2.50 | ~$5 | ~$7.50 | +| 10 | D8_v3 | ~30 min | ~$2.50 | ~$5 | ~$7.50 | + +**Note:** More workers = faster, same total cost (compute + API calls are constant). 
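+
+To make the Phase 4 wrapper concrete, a minimal sketch of what `cmd_scale` could do (hypothetical code — flag names follow the `run_azure.py` invocations shown earlier; the workspace/ACR checks, progress monitoring, and result download are elided):
+
+```python
+import subprocess
+
+
+def cmd_scale(workers: int, agent: str, model: str) -> int:
+    """Delegate orchestration to the official run_azure.py, using our image."""
+    cmd = [
+        "python", "scripts/run_azure.py",
+        "--num_workers", str(workers),
+        "--agent", agent,  # e.g. "api-openai" once the Phase 2 wrapper lands
+        "--model_name", model,
+        "--docker_img_name", "openadaptacr.azurecr.io/waa-auto:latest",
+    ]
+    return subprocess.run(cmd, check=False).returncode
+```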
+ +--- + +## Summary + +| Aspect | Approach | +|--------|----------| +| **Parallelization** | Use official `run_azure.py` (Azure ML Compute) | +| **Docker Image** | Our `waa-auto:latest` (auto-download Windows, API agents) | +| **Agent** | Our `OpenAdaptAgent` wrapper (uses Claude/OpenAI) | +| **CLI** | Wrapper command `cli.py scale` | +| **Development** | Dedicated VM with VNC for debugging | + +**Total new code:** ~300 lines +**Reused from official WAA:** Parallelization, compute management, task distribution +**Reused from our work:** Dockerfile, api_agent.py, WAALiveAdapter concepts + +--- + +## Next Steps + +1. [ ] Create `mm_agents/openadapt/agent.py` wrapper (~100 lines) +2. [ ] Test on dedicated VM with `--agent api-openai` +3. [ ] Push `waa-auto:latest` to Azure Container Registry +4. [ ] Modify `run_azure.py` to use our image +5. [ ] Add `scale` command to CLI +6. [ ] Request vCPU quota increase (80+ for 10 workers) +7. [ ] Run full 154-task benchmark diff --git a/openadapt_ml/benchmarks/waa_deploy/Dockerfile b/openadapt_ml/benchmarks/waa_deploy/Dockerfile index 02d0817..b5078f8 100644 --- a/openadapt_ml/benchmarks/waa_deploy/Dockerfile +++ b/openadapt_ml/benchmarks/waa_deploy/Dockerfile @@ -83,16 +83,15 @@ RUN find /client -name "*.py" -exec sed -i 's|20.20.20.21|172.30.0.2|g' {} \; && echo "Patched client Python files" # ----------------------------------------------------------------------------- -# Add API-backed agent support (Claude Sonnet 4.5 / GPT-5.1) -# This allows using --agent api-claude or --agent api-openai instead of navi +# Add API-backed agent support (Claude / OpenAI) +# NOTE: API agents (api-claude, api-openai) are run EXTERNALLY via openadapt-evals CLI +# which connects to the WAA server over SSH tunnel. No internal patching needed. +# The api_agent.py is included for reference/future use. 
# ----------------------------------------------------------------------------- -# Copy api_agent.py to the client mm_agents directory +# Copy api_agent.py for reference (used externally by openadapt-evals) COPY api_agent.py /client/mm_agents/api_agent.py -# Note: API agent patching (api-claude, api-openai) skipped for now -# The navi agent works out of the box - API agents can be added later - # ----------------------------------------------------------------------------- # Fix Windows setup for automation # ----------------------------------------------------------------------------- From 0fe26aa68dcfc637f60dbbaadf9c6e3a638100d4 Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Wed, 28 Jan 2026 23:51:58 -0500 Subject: [PATCH 18/21] style: fix ruff formatting Co-Authored-By: Claude Opus 4.5 --- openadapt_ml/benchmarks/agent.py | 81 +++++++++++++++++++++++++------- openadapt_ml/benchmarks/cli.py | 24 ++++++++-- 2 files changed, 83 insertions(+), 22 deletions(-) diff --git a/openadapt_ml/benchmarks/agent.py b/openadapt_ml/benchmarks/agent.py index 9d4b027..c97d63f 100644 --- a/openadapt_ml/benchmarks/agent.py +++ b/openadapt_ml/benchmarks/agent.py @@ -454,7 +454,9 @@ def _parse_response( ) if click_match: node_id = click_match.group(1) - return BenchmarkAction(type="click", target_node_id=node_id, raw_action=raw_action) + return BenchmarkAction( + type="click", target_node_id=node_id, raw_action=raw_action + ) # Parse CLICK(x, y) click_coords = re.match( @@ -475,7 +477,9 @@ def _parse_response( r"TYPE\s*\(\s*[\"'](.+?)[\"']\s*\)", action_line, re.IGNORECASE ) if type_match: - return BenchmarkAction(type="type", text=type_match.group(1), raw_action=raw_action) + return BenchmarkAction( + type="type", text=type_match.group(1), raw_action=raw_action + ) # Parse KEY key_match = re.match(r"KEY\s*\(\s*(.+?)\s*\)", action_line, re.IGNORECASE) @@ -483,7 +487,12 @@ def _parse_response( key_str = key_match.group(1) if "+" in key_str: parts = key_str.split("+") - return BenchmarkAction(type="key", key=parts[-1], modifiers=parts[:-1], raw_action=raw_action) + return BenchmarkAction( + type="key", + key=parts[-1], + modifiers=parts[:-1], + raw_action=raw_action, + ) return BenchmarkAction(type="key", key=key_str, raw_action=raw_action) # Parse SCROLL @@ -491,22 +500,43 @@ def _parse_response( r"SCROLL\s*\(\s*(up|down)\s*\)", action_line, re.IGNORECASE ) if scroll_match: - return BenchmarkAction(type="scroll", scroll_direction=scroll_match.group(1).lower(), raw_action=raw_action) + return BenchmarkAction( + type="scroll", + scroll_direction=scroll_match.group(1).lower(), + raw_action=raw_action, + ) # Parse DRAG drag_match = re.match( r"DRAG\s*\(\s*([\d.]+)\s*,\s*([\d.]+)\s*,\s*([\d.]+)\s*,\s*([\d.]+)\s*\)", - action_line, re.IGNORECASE, + action_line, + re.IGNORECASE, ) if drag_match: x, y = float(drag_match.group(1)), float(drag_match.group(2)) end_x, end_y = float(drag_match.group(3)), float(drag_match.group(4)) - if observation and observation.viewport and (x > 1.0 or y > 1.0 or end_x > 1.0 or end_y > 1.0): + if ( + observation + and observation.viewport + and (x > 1.0 or y > 1.0 or end_x > 1.0 or end_y > 1.0) + ): width, height = observation.viewport - raw_action["original_coords"] = {"x": x, "y": y, "end_x": end_x, "end_y": end_y} + raw_action["original_coords"] = { + "x": x, + "y": y, + "end_x": end_x, + "end_y": end_y, + } raw_action["normalized"] = True - x, y, end_x, end_y = x/width, y/height, end_x/width, end_y/height - return BenchmarkAction(type="drag", x=x, y=y, end_x=end_x, end_y=end_y, 
raw_action=raw_action) + x, y, end_x, end_y = ( + x / width, + y / height, + end_x / width, + end_y / height, + ) + return BenchmarkAction( + type="drag", x=x, y=y, end_x=end_x, end_y=end_y, raw_action=raw_action + ) # Parse DONE if re.match(r"DONE\s*\(\s*\)", action_line, re.IGNORECASE): @@ -517,7 +547,9 @@ def _parse_response( r"ANSWER\s*\(\s*[\"'](.+?)[\"']\s*\)", action_line, re.IGNORECASE ) if answer_match: - return BenchmarkAction(type="answer", answer=answer_match.group(1), raw_action=raw_action) + return BenchmarkAction( + type="answer", answer=answer_match.group(1), raw_action=raw_action + ) raw_action["parse_error"] = f"Unknown action format: {action_line}" return BenchmarkAction(type="done", raw_action=raw_action) @@ -604,11 +636,15 @@ def act( if self.verbose: print(f"[UnifiedBaselineAgent] Failed to load screenshot: {e}") - a11y_tree = observation.accessibility_tree if observation.accessibility_tree else None + a11y_tree = ( + observation.accessibility_tree if observation.accessibility_tree else None + ) adapter_history = None if history: - adapter_history = [self._benchmark_action_to_dict(a) for _, a in history[-5:]] + adapter_history = [ + self._benchmark_action_to_dict(a) for _, a in history[-5:] + ] try: parsed_action = adapter.predict( @@ -659,7 +695,9 @@ def _parsed_to_benchmark_action( if action_type == "click": if parsed_action.element_id is not None: return BenchmarkAction( - type="click", target_node_id=str(parsed_action.element_id), raw_action=raw_action + type="click", + target_node_id=str(parsed_action.element_id), + raw_action=raw_action, ) elif parsed_action.x is not None and parsed_action.y is not None: x, y = parsed_action.x, parsed_action.y @@ -670,13 +708,21 @@ def _parsed_to_benchmark_action( return BenchmarkAction(type="click", x=x, y=y, raw_action=raw_action) elif action_type == "type": - return BenchmarkAction(type="type", text=parsed_action.text, raw_action=raw_action) + return BenchmarkAction( + type="type", text=parsed_action.text, raw_action=raw_action + ) elif action_type == "key": - return BenchmarkAction(type="key", key=parsed_action.key, raw_action=raw_action) + return BenchmarkAction( + type="key", key=parsed_action.key, raw_action=raw_action + ) elif action_type == "scroll": - return BenchmarkAction(type="scroll", scroll_direction=parsed_action.direction, raw_action=raw_action) + return BenchmarkAction( + type="scroll", + scroll_direction=parsed_action.direction, + raw_action=raw_action, + ) elif action_type == "done": return BenchmarkAction(type="done", raw_action=raw_action) @@ -684,7 +730,8 @@ def _parsed_to_benchmark_action( elif action_type == "drag": return BenchmarkAction( type="drag", - x=parsed_action.x, y=parsed_action.y, + x=parsed_action.x, + y=parsed_action.y, end_x=getattr(parsed_action, "end_x", None), end_y=getattr(parsed_action, "end_y", None), raw_action=raw_action, diff --git a/openadapt_ml/benchmarks/cli.py b/openadapt_ml/benchmarks/cli.py index a4a51c4..90cb1ae 100644 --- a/openadapt_ml/benchmarks/cli.py +++ b/openadapt_ml/benchmarks/cli.py @@ -348,7 +348,10 @@ def cmd_create(args): if use_fast: # Try multiple fast sizes with fallbacks sizes_to_try = VM_SIZE_FAST_FALLBACKS - log("CREATE", f"Creating VM '{VM_NAME}' with --fast (trying multiple D8 sizes)...") + log( + "CREATE", + f"Creating VM '{VM_NAME}' with --fast (trying multiple D8 sizes)...", + ) else: # Standard mode: single size sizes_to_try = [(VM_SIZE_STANDARD, 0.19)] @@ -410,7 +413,10 @@ def cmd_create(args): log("CREATE", "Tried sizes: " + ", ".join(s[0] for s in 
sizes_to_try)) return 1 - log("CREATE", f"Successfully created {successful_size} (${successful_cost:.2f}/hr) in {region}") + log( + "CREATE", + f"Successfully created {successful_size} (${successful_cost:.2f}/hr) in {region}", + ) # Wait for SSH log("CREATE", "Waiting for SSH...") @@ -705,7 +711,10 @@ def cmd_start(args): if getattr(args, "fast", False): ram_size = "16G" cpu_cores = 6 - log("START", "Starting container with VERSION=11e (FAST mode: 6 cores, 16GB RAM)...") + log( + "START", + "Starting container with VERSION=11e (FAST mode: 6 cores, 16GB RAM)...", + ) else: ram_size = "8G" cpu_cores = 4 @@ -738,7 +747,10 @@ def cmd_start(args): log("START", "Auto-launching VNC viewer...") tunnel_proc = setup_vnc_tunnel_and_browser(ip) if tunnel_proc: - log("START", f"VNC auto-launched at http://localhost:8006 (tunnel PID: {tunnel_proc.pid})") + log( + "START", + f"VNC auto-launched at http://localhost:8006 (tunnel PID: {tunnel_proc.pid})", + ) else: log("START", "WARNING: VNC tunnel failed to start") log("START", f"Manual VNC: ssh -L 8006:localhost:8006 azureuser@{ip}") @@ -1476,7 +1488,9 @@ def _show_benchmark_progress(ip: str) -> int: Parses the run log to count completed tasks and estimate remaining time. """ # Find the most recent run log - result = ssh_run(ip, "ls -t /home/azureuser/cli_logs/run_*.log 2>/dev/null | head -1") + result = ssh_run( + ip, "ls -t /home/azureuser/cli_logs/run_*.log 2>/dev/null | head -1" + ) log_file = result.stdout.strip() if not log_file: From 4336e81139afc8eba6119d4d4020a977ca89456a Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Thu, 29 Jan 2026 00:26:05 -0500 Subject: [PATCH 19/21] fix(imports): update internal code to import from openadapt-evals Replace imports from deleted benchmark files with direct imports from openadapt-evals: - azure.py: BenchmarkResult, BenchmarkTask, WAAAdapter - waa_demo/runner.py: BenchmarkAction, WAAMockAdapter, etc. This completes the migration to the two-package architecture where openadapt-evals is the canonical source for benchmark infrastructure. Co-Authored-By: Claude Opus 4.5 --- openadapt_ml/benchmarks/azure.py | 4 ++-- openadapt_ml/experiments/waa_demo/runner.py | 11 ++++------- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/openadapt_ml/benchmarks/azure.py b/openadapt_ml/benchmarks/azure.py index 157c62b..d2588a3 100644 --- a/openadapt_ml/benchmarks/azure.py +++ b/openadapt_ml/benchmarks/azure.py @@ -36,8 +36,8 @@ from pathlib import Path from typing import Callable +from openadapt_evals import BenchmarkResult, BenchmarkTask from openadapt_ml.benchmarks.agent import BenchmarkAgent -from openadapt_ml.benchmarks.base import BenchmarkResult, BenchmarkTask logger = logging.getLogger(__name__) @@ -496,7 +496,7 @@ def run_evaluation( List of BenchmarkResult for all tasks. 
""" # Load tasks - from openadapt_ml.benchmarks.waa import WAAAdapter + from openadapt_evals import WAAMockAdapter as WAAAdapter adapter = WAAAdapter(waa_repo_path=self.waa_repo_path) if task_ids: diff --git a/openadapt_ml/experiments/waa_demo/runner.py b/openadapt_ml/experiments/waa_demo/runner.py index 77fa630..86bb999 100644 --- a/openadapt_ml/experiments/waa_demo/runner.py +++ b/openadapt_ml/experiments/waa_demo/runner.py @@ -38,7 +38,7 @@ ) if TYPE_CHECKING: - from openadapt_ml.benchmarks.base import ( + from openadapt_evals import ( BenchmarkAction, BenchmarkObservation, BenchmarkTask, @@ -267,7 +267,7 @@ def act( Returns: BenchmarkAction parsed from VLM response """ - from openadapt_ml.benchmarks.base import BenchmarkAction + from openadapt_evals import BenchmarkAction adapter = self._get_adapter() @@ -409,7 +409,7 @@ def _parse_response( Uses the same parsing logic as APIBenchmarkAgent. """ import re - from openadapt_ml.benchmarks.base import BenchmarkAction + from openadapt_evals import BenchmarkAction raw_action = {"response": response} @@ -512,12 +512,9 @@ def cmd_run(args: argparse.Namespace) -> int: This integrates with the benchmarks infrastructure to run either zero-shot or demo-conditioned evaluation on WAA tasks. """ - from openadapt_ml.benchmarks import ( + from openadapt_evals import ( WAAMockAdapter, compute_metrics, - ) - from openadapt_ml.benchmarks.runner import ( - EvaluationConfig, evaluate_agent_on_benchmark, ) From 20e9078ddcafbd731146bbded248502e9647a210 Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Thu, 29 Jan 2026 00:49:17 -0500 Subject: [PATCH 20/21] fix(imports): add missing EvaluationConfig import - Update azure.py to import BenchmarkAgent from openadapt_evals - Add EvaluationConfig to runner.py imports Fixes CI failure: F821 Undefined name `EvaluationConfig` Co-Authored-By: Claude Opus 4.5 --- openadapt_ml/benchmarks/azure.py | 3 +-- openadapt_ml/experiments/waa_demo/runner.py | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/openadapt_ml/benchmarks/azure.py b/openadapt_ml/benchmarks/azure.py index d2588a3..f18f59c 100644 --- a/openadapt_ml/benchmarks/azure.py +++ b/openadapt_ml/benchmarks/azure.py @@ -36,8 +36,7 @@ from pathlib import Path from typing import Callable -from openadapt_evals import BenchmarkResult, BenchmarkTask -from openadapt_ml.benchmarks.agent import BenchmarkAgent +from openadapt_evals import BenchmarkAgent, BenchmarkResult, BenchmarkTask logger = logging.getLogger(__name__) diff --git a/openadapt_ml/experiments/waa_demo/runner.py b/openadapt_ml/experiments/waa_demo/runner.py index 86bb999..73c826d 100644 --- a/openadapt_ml/experiments/waa_demo/runner.py +++ b/openadapt_ml/experiments/waa_demo/runner.py @@ -513,6 +513,7 @@ def cmd_run(args: argparse.Namespace) -> int: zero-shot or demo-conditioned evaluation on WAA tasks. """ from openadapt_evals import ( + EvaluationConfig, WAAMockAdapter, compute_metrics, evaluate_agent_on_benchmark, From 0c0ce72ad802eefce0a1b7777d8190c85673cd9b Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Thu, 29 Jan 2026 01:00:50 -0500 Subject: [PATCH 21/21] fix(deps): require openadapt-evals>=0.1.1 v0.1.0 uses task ID format "browser_1" but tests expect "mock_browser_001" which was added in v0.1.1. 
Co-Authored-By: Claude Opus 4.5 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 2f907fd..4910c68 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -70,7 +70,7 @@ dev = [ ] # Benchmark evaluation (depends on openadapt-evals) benchmarks = [ - "openadapt-evals>=0.1.0", + "openadapt-evals>=0.1.1", ] [project.urls]