9 changes: 5 additions & 4 deletions .beads/issues.jsonl

Large diffs are not rendered by default.

40 changes: 40 additions & 0 deletions README.md
@@ -422,6 +422,46 @@ See [LIVE_MONITORING.md](./LIVE_MONITORING.md) for full documentation.
- [CLAUDE.md](./CLAUDE.md) - Development guide and best practices
- [CHANGELOG.md](./CHANGELOG.md) - Version history and changes

## WAA Benchmark Results

> **⚠️ PLACEHOLDER**: The results below are placeholders. Actual benchmark results will be added once the full evaluation completes.

### Baseline Reproduction

We run the full WAA benchmark using the same methodology as the original paper to establish baseline performance.

**WAA Baseline Results (GPT-4o):**

| Metric | Paper Reported | Our Reproduction | Status |
|--------|----------------|------------------|--------|
| Success Rate | ~19.5% | `[PLACEHOLDER]` | `[PENDING]` |
| Tasks Evaluated | 154 | `[PLACEHOLDER]` | `[PENDING]` |
| Avg Steps/Task | N/A | `[PLACEHOLDER]` | `[PENDING]` |
| Avg Time/Task | N/A | `[PLACEHOLDER]` | `[PENDING]` |

### Model Comparison

Performance of different agents on WAA:

| Agent | Success Rate | Avg Steps | Notes |
|-------|--------------|-----------|-------|
| GPT-4o (baseline) | `[PLACEHOLDER]` | `[PLACEHOLDER]` | Zero-shot |
| Claude Sonnet 4.5 | `[PLACEHOLDER]` | `[PLACEHOLDER]` | Zero-shot |

### Domain Breakdown

Success rates by Windows application domain:

| Domain | Tasks | Success Rate |
|--------|-------|--------------|
| Notepad | `[PLACEHOLDER]` | `[PLACEHOLDER]` |
| Chrome | `[PLACEHOLDER]` | `[PLACEHOLDER]` |
| File Explorer | `[PLACEHOLDER]` | `[PLACEHOLDER]` |
| Settings | `[PLACEHOLDER]` | `[PLACEHOLDER]` |
| ... | ... | ... |

> **Note**: Full domain breakdown will be added when benchmark completes.
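Once the evaluation completes, the domain breakdown above can be derived from per-task results. A minimal sketch, assuming each `BenchmarkResult` is kept alongside the `BenchmarkTask` it scored; the `domain_success_rates` helper below is illustrative and not part of the package:

```python
from collections import defaultdict

def domain_success_rates(tasks, results):
    """Compute {domain: success_rate} from parallel lists of tasks and results."""
    totals = defaultdict(int)
    successes = defaultdict(int)
    for task, result in zip(tasks, results):
        totals[task.domain] += 1                        # BenchmarkTask.domain, e.g. "notepad"
        successes[task.domain] += int(result.success)   # BenchmarkResult.success
    return {domain: successes[domain] / totals[domain] for domain in totals}
```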

## License

MIT
16 changes: 14 additions & 2 deletions openadapt_evals/adapters/__init__.py
@@ -34,8 +34,16 @@
StaticDatasetAdapter,
UIElement,
)
from openadapt_evals.adapters.waa import WAAAdapter, WAAConfig, WAAMockAdapter
from openadapt_evals.adapters.waa_live import WAALiveAdapter, WAALiveConfig
from openadapt_evals.adapters.waa import (
WAAAdapter,
WAAConfig,
WAAMockAdapter,
WAALiveAdapter,
WAALiveConfig,
SyntheticTaskError,
is_real_waa_task_id,
is_synthetic_task_id,
)

__all__ = [
# Base classes
@@ -52,4 +60,8 @@
"WAAMockAdapter",
"WAALiveAdapter",
"WAALiveConfig",
# Task ID validation
"SyntheticTaskError",
"is_real_waa_task_id",
"is_synthetic_task_id",
]
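As a quick sanity check of the new exports, the helpers can be exercised directly. This is an illustrative sketch (it assumes the package is importable) using the ID formats documented later in this PR:

```python
from openadapt_evals.adapters import is_real_waa_task_id, is_synthetic_task_id

# Real WAA IDs are UUIDs with an optional domain suffix.
assert is_real_waa_task_id("a1b2c3d4-e5f6-7890-abcd-ef1234567890-WOS")
assert not is_real_waa_task_id("notepad_1")

# Synthetic IDs come from the mock adapter or ad-hoc testing.
assert is_synthetic_task_id("mock_notepad_001")
assert is_synthetic_task_id("notepad_1")
```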
51 changes: 51 additions & 0 deletions openadapt_evals/adapters/waa/__init__.py
@@ -0,0 +1,51 @@
"""Windows Agent Arena (WAA) adapters.

This module provides adapters for the Windows Agent Arena benchmark:
- WAAAdapter: Full WAA integration (requires WAA repo)
- WAAMockAdapter: Mock adapter for testing (no Windows required)
- WAALiveAdapter: HTTP adapter for remote WAA server

Example:
```python
from openadapt_evals.adapters.waa import WAAMockAdapter, WAALiveAdapter

# For local testing (no Windows VM)
adapter = WAAMockAdapter(num_tasks=10)

# For remote evaluation
adapter = WAALiveAdapter(server_url="http://vm-ip:5000")
```
"""

from openadapt_evals.adapters.waa.mock import (
WAAAdapter,
WAAConfig,
WAAMockAdapter,
WAA_DOMAINS,
)
from openadapt_evals.adapters.waa.live import (
WAALiveAdapter,
WAALiveConfig,
SyntheticTaskError,
is_real_waa_task_id,
is_synthetic_task_id,
WAA_TASK_ID_PATTERN,
SYNTHETIC_TASK_PATTERNS,
)

__all__ = [
# Mock/full adapters
"WAAAdapter",
"WAAConfig",
"WAAMockAdapter",
"WAA_DOMAINS",
# Live adapter
"WAALiveAdapter",
"WAALiveConfig",
"WAA_TASK_ID_PATTERN",
"SYNTHETIC_TASK_PATTERNS",
# Task ID validation
"SyntheticTaskError",
"is_real_waa_task_id",
"is_synthetic_task_id",
]
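The validation exports above back a guard in the live adapter: synthetic IDs are rejected before any HTTP request is made. A hedged usage sketch (the `server_url` is a placeholder, and this assumes the adapter can be constructed without a reachable VM):

```python
from openadapt_evals.adapters.waa import (
    SyntheticTaskError,
    WAALiveAdapter,
    WAALiveConfig,
)

adapter = WAALiveAdapter(WAALiveConfig(server_url="http://vm-ip:5000"))
try:
    adapter.load_task("mock_notepad_001")  # synthetic ID -> rejected up front
except SyntheticTaskError as exc:
    print(exc)  # message points at --mock, --task-ids, or --tasks N
```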
openadapt_evals/adapters/waa/live.py
@@ -15,7 +15,7 @@
not pixel coordinates. WAA's Computer class handles the grounding.

Example:
from openadapt_evals.benchmarks.waa_live import WAALiveAdapter, WAALiveConfig
from openadapt_evals.adapters.waa import WAALiveAdapter, WAALiveConfig

adapter = WAALiveAdapter(WAALiveConfig(server_url="http://vm-ip:5000"))
agent = DemoConditionedAgent(base_agent, retriever)
@@ -26,6 +26,7 @@

import base64
import logging
import re
import time
from dataclasses import dataclass
from typing import Any
@@ -41,6 +42,70 @@
logger = logging.getLogger(__name__)


# WAA task IDs are UUIDs with a domain suffix, e.g., "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx-WOS"
# Common suffixes: WOS (Windows OS), CHR (Chrome), NTP (Notepad), etc.
WAA_TASK_ID_PATTERN = re.compile(
r'^[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}(-[A-Za-z0-9]+)?$'
)

# Synthetic task ID patterns (from mock adapter or testing)
SYNTHETIC_TASK_PATTERNS = [
re.compile(r'^(mock_)?[a-z_]+_\d+$'), # notepad_1, mock_chrome_001
re.compile(r'^mock_'), # any mock_ prefix
]


def is_real_waa_task_id(task_id: str) -> bool:
"""Check if a task ID matches the real WAA UUID format.

Real WAA task IDs are UUIDs from test_small.json or test_all.json, e.g.:
- "a1b2c3d4-e5f6-7890-abcd-ef1234567890-WOS"
- "12345678-1234-1234-1234-123456789012-CHR"

Synthetic task IDs are simple patterns like:
- "notepad_1", "chrome_2" (from mock adapter)
- "mock_notepad_001" (explicit mock prefix)

Args:
task_id: Task identifier to check.

Returns:
True if the task ID appears to be a real WAA UUID.
"""
return bool(WAA_TASK_ID_PATTERN.match(task_id))


def is_synthetic_task_id(task_id: str) -> bool:
"""Check if a task ID appears to be synthetic (for testing).

Args:
task_id: Task identifier to check.

Returns:
True if the task ID matches synthetic patterns.
"""
for pattern in SYNTHETIC_TASK_PATTERNS:
if pattern.match(task_id):
return True
return False


class SyntheticTaskError(ValueError):
"""Raised when a synthetic task ID is used with the live adapter."""

def __init__(self, task_id: str):
self.task_id = task_id
super().__init__(
f"Task ID '{task_id}' appears to be synthetic (for testing). "
f"The live adapter requires real WAA task IDs (UUIDs from test_small.json or test_all.json). "
f"\n\nTo fix this:"
f"\n 1. Use --mock flag for testing without a Windows VM"
f"\n 2. Or provide real WAA task IDs with --task-ids"
f"\n 3. Or use --tasks N to select N random real tasks"
f"\n\nExample real task ID: 'a1b2c3d4-e5f6-7890-abcd-ef1234567890-WOS'"
)


@dataclass
class WAALiveConfig:
"""Configuration for WAALiveAdapter.
Expand Down Expand Up @@ -139,11 +204,20 @@ def load_task(self, task_id: str) -> BenchmarkTask:
3. Creates minimal task as fallback

Args:
task_id: Task identifier (e.g., "notepad_1", "browser_abc123").
task_id: Task identifier. Must be a real WAA UUID
(e.g., "a1b2c3d4-e5f6-7890-abcd-ef1234567890-WOS").

Returns:
BenchmarkTask object with evaluator config if available.

Raises:
SyntheticTaskError: If task_id appears to be synthetic (e.g., "notepad_1").
Use WAAMockAdapter for synthetic/testing tasks.
"""
# Validate that this is a real WAA task ID, not a synthetic one
if is_synthetic_task_id(task_id):
raise SyntheticTaskError(task_id)

import requests

# Try to load from server first
@@ -447,46 +521,45 @@ def evaluate(self, task: BenchmarkTask) -> BenchmarkResult:
return self._evaluate_fallback(task)

def _evaluate_fallback(self, task: BenchmarkTask) -> BenchmarkResult:
"""Fallback evaluation when /evaluate endpoint is unavailable.
"""Fallback when proper evaluation unavailable - returns failure.

Uses a simple heuristic based on:
- Whether the agent took any actions
- Whether the agent called DONE
- Whether the task has success criteria we can check locally
This method explicitly fails instead of providing fake heuristic scores.
Proper evaluation requires either:
1. WAA server with /evaluate endpoint deployed
2. Task configs with evaluator specs (set waa_examples_path)
3. Real WAA task IDs (UUIDs from test_small.json/test_all.json)

Args:
task: Task to evaluate.

Returns:
BenchmarkResult with heuristic-based score.
BenchmarkResult with success=False and score=0.0.
"""
has_actions = len(self._actions) > 0
called_done = any(a.type == "done" for a in self._actions)
typed_text = any(a.type == "type" and a.text for a in self._actions)

# Calculate heuristic score
score = 0.0
if has_actions:
score += 0.2
if called_done:
score += 0.2
if typed_text:
score += 0.1
if self._step_count >= 2:
score += 0.1

# Cap at 0.5 since we can't truly verify success
score = min(score, 0.5)
# Check if task has evaluator config
has_evaluator = bool(
task.raw_config and task.raw_config.get("evaluator")
)

if has_evaluator:
reason = (
"Evaluation unavailable: WAA /evaluate endpoint not deployed. "
"Task has evaluator config but server cannot run it."
)
else:
reason = (
"Evaluation unavailable: task config missing evaluator spec. "
"Set waa_examples_path in config or use real WAA task IDs "
"(UUIDs from test_small.json/test_all.json, not synthetic IDs like 'notepad_1')."
)

logger.error(reason)

return BenchmarkResult(
task_id=task.task_id,
success=False, # Can't determine without proper evaluation
score=score,
success=False,
score=0.0,
num_steps=self._step_count,
reason=(
"Fallback evaluation (WAA /evaluate endpoint unavailable). "
f"Heuristic: actions={len(self._actions)}, done={called_done}, typed={typed_text}"
),
reason=reason,
)

def close(self) -> None:
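With this change, a missing `/evaluate` endpoint surfaces as an explicit failure instead of a partial heuristic score. A hedged sketch of how a caller might collect those results downstream (the `unavailable_evaluations` helper is hypothetical, written against the `BenchmarkResult` fields used above):

```python
def unavailable_evaluations(results):
    """Return results whose WAA evaluation could not actually be run.

    _evaluate_fallback marks these with success=False, score=0.0, and a
    reason beginning with "Evaluation unavailable".
    """
    return [
        r for r in results
        if not r.success and r.score == 0.0 and "Evaluation unavailable" in r.reason
    ]
```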
openadapt_evals/adapters/waa/mock.py
@@ -544,14 +544,24 @@ def _to_waa_action(self, action: BenchmarkAction) -> dict:
class WAAMockAdapter(BenchmarkAdapter):
"""Mock WAA adapter for testing without Windows VM.

This adapter generates synthetic tasks for testing the benchmark infrastructure
without requiring a Windows VM or WAA server. Task IDs are prefixed with "mock_"
to clearly distinguish them from real WAA task IDs.

Useful for:
- Testing the benchmark integration without actual WAA
- Development on non-Windows platforms
- Unit tests
- Verifying agent behavior before running real evaluations

Args:
num_tasks: Number of mock tasks to generate.
domains: Domains to include in mock tasks.

Note:
Mock task IDs use the format "mock_{domain}_{number}" (e.g., "mock_notepad_001").
These IDs are explicitly rejected by WAALiveAdapter to prevent confusion
between testing and real evaluation runs.
"""

def __init__(
@@ -578,21 +588,27 @@ def benchmark_type(self) -> str:
return "interactive"

def _generate_mock_tasks(self) -> None:
"""Generate mock tasks for testing."""
"""Generate mock tasks for testing.

Task IDs use the format "mock_{domain}_{number}" (e.g., "mock_notepad_001")
to clearly distinguish them from real WAA UUIDs. This prevents accidental
use of synthetic tasks with the live adapter.
"""
tasks_per_domain = self._num_tasks // len(self._domains)
extra = self._num_tasks % len(self._domains)

for i, domain in enumerate(self._domains):
count = tasks_per_domain + (1 if i < extra else 0)
for j in range(count):
task_id = f"{domain}_{j + 1}"
# Use mock_ prefix to clearly indicate synthetic task
task_id = f"mock_{domain}_{j + 1:03d}"
self._tasks.append(
BenchmarkTask(
task_id=task_id,
instruction=f"Mock task {j + 1} in {domain} domain",
domain=domain,
time_limit_steps=15,
raw_config={"mock": True},
raw_config={"mock": True, "synthetic": True},
)
)

Expand Down
Loading