9 changes: 5 additions & 4 deletions .beads/issues.jsonl

Large diffs are not rendered by default.

40 changes: 40 additions & 0 deletions README.md
@@ -422,6 +422,46 @@ See [LIVE_MONITORING.md](./LIVE_MONITORING.md) for full documentation.
- [CLAUDE.md](./CLAUDE.md) - Development guide and best practices
- [CHANGELOG.md](./CHANGELOG.md) - Version history and changes

## WAA Benchmark Results

> **⚠️ PLACEHOLDER**: The results below are placeholders. Actual benchmark results will be added once the full evaluation completes.

### Baseline Reproduction

We run the full WAA benchmark using the same methodology as the original paper to establish baseline performance.

**WAA Baseline Results (GPT-4o):**

| Metric | Paper Reported | Our Reproduction | Status |
|--------|----------------|------------------|--------|
| Success Rate | ~19.5% | `[PLACEHOLDER]` | `[PENDING]` |
| Tasks Evaluated | 154 | `[PLACEHOLDER]` | `[PENDING]` |
| Avg Steps/Task | N/A | `[PLACEHOLDER]` | `[PENDING]` |
| Avg Time/Task | N/A | `[PLACEHOLDER]` | `[PENDING]` |

### Model Comparison

Performance of different agents on WAA:

| Agent | Success Rate | Avg Steps | Notes |
|-------|--------------|-----------|-------|
| GPT-4o (baseline) | `[PLACEHOLDER]` | `[PLACEHOLDER]` | Zero-shot |
| Claude Sonnet 4.5 | `[PLACEHOLDER]` | `[PLACEHOLDER]` | Zero-shot |

### Domain Breakdown

Success rates by Windows application domain:

| Domain | Tasks | Success Rate |
|--------|-------|--------------|
| Notepad | `[PLACEHOLDER]` | `[PLACEHOLDER]` |
| Chrome | `[PLACEHOLDER]` | `[PLACEHOLDER]` |
| File Explorer | `[PLACEHOLDER]` | `[PLACEHOLDER]` |
| Settings | `[PLACEHOLDER]` | `[PLACEHOLDER]` |
| ... | ... | ... |

> **Note**: Full domain breakdown will be added when benchmark completes.
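Once the evaluation completes, the domain breakdown above can be derived from per-task results. A minimal sketch, assuming each `BenchmarkResult` is kept alongside the `BenchmarkTask` it scored; the `domain_success_rates` helper below is illustrative and not part of the package:

```python
from collections import defaultdict

def domain_success_rates(tasks, results):
    """Compute {domain: success_rate} from parallel lists of tasks and results."""
    totals = defaultdict(int)
    successes = defaultdict(int)
    for task, result in zip(tasks, results):
        totals[task.domain] += 1                        # BenchmarkTask.domain, e.g. "notepad"
        successes[task.domain] += int(result.success)   # BenchmarkResult.success
    return {domain: successes[domain] / totals[domain] for domain in totals}
```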

## License

MIT
16 changes: 14 additions & 2 deletions openadapt_evals/adapters/__init__.py
@@ -34,8 +34,16 @@
StaticDatasetAdapter,
UIElement,
)
from openadapt_evals.adapters.waa import WAAAdapter, WAAConfig, WAAMockAdapter
from openadapt_evals.adapters.waa_live import WAALiveAdapter, WAALiveConfig
from openadapt_evals.adapters.waa import (
WAAAdapter,
WAAConfig,
WAAMockAdapter,
WAALiveAdapter,
WAALiveConfig,
SyntheticTaskError,
is_real_waa_task_id,
is_synthetic_task_id,
)

__all__ = [
# Base classes
@@ -52,4 +60,8 @@
"WAAMockAdapter",
"WAALiveAdapter",
"WAALiveConfig",
# Task ID validation
"SyntheticTaskError",
"is_real_waa_task_id",
"is_synthetic_task_id",
]
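As a quick sanity check of the new exports, the helpers can be exercised directly. This is an illustrative sketch (it assumes the package is importable) using the ID formats documented later in this PR:

```python
from openadapt_evals.adapters import is_real_waa_task_id, is_synthetic_task_id

# Real WAA IDs are UUIDs with an optional domain suffix.
assert is_real_waa_task_id("a1b2c3d4-e5f6-7890-abcd-ef1234567890-WOS")
assert not is_real_waa_task_id("notepad_1")

# Synthetic IDs come from the mock adapter or ad-hoc testing.
assert is_synthetic_task_id("mock_notepad_001")
assert is_synthetic_task_id("notepad_1")
```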
51 changes: 51 additions & 0 deletions openadapt_evals/adapters/waa/__init__.py
@@ -0,0 +1,51 @@
"""Windows Agent Arena (WAA) adapters.

This module provides adapters for the Windows Agent Arena benchmark:
- WAAAdapter: Full WAA integration (requires WAA repo)
- WAAMockAdapter: Mock adapter for testing (no Windows required)
- WAALiveAdapter: HTTP adapter for remote WAA server

Example:
```python
from openadapt_evals.adapters.waa import WAAMockAdapter, WAALiveAdapter

# For local testing (no Windows VM)
adapter = WAAMockAdapter(num_tasks=10)

# For remote evaluation
adapter = WAALiveAdapter(server_url="http://vm-ip:5000")
```
"""

from openadapt_evals.adapters.waa.mock import (
WAAAdapter,
WAAConfig,
WAAMockAdapter,
WAA_DOMAINS,
)
from openadapt_evals.adapters.waa.live import (
WAALiveAdapter,
WAALiveConfig,
SyntheticTaskError,
is_real_waa_task_id,
is_synthetic_task_id,
WAA_TASK_ID_PATTERN,
SYNTHETIC_TASK_PATTERNS,
)

__all__ = [
# Mock/full adapters
"WAAAdapter",
"WAAConfig",
"WAAMockAdapter",
"WAA_DOMAINS",
# Live adapter
"WAALiveAdapter",
"WAALiveConfig",
"WAA_TASK_ID_PATTERN",
"SYNTHETIC_TASK_PATTERNS",
# Task ID validation
"SyntheticTaskError",
"is_real_waa_task_id",
"is_synthetic_task_id",
]
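The validation exports above back a guard in the live adapter: synthetic IDs are rejected before any HTTP request is made. A hedged usage sketch (the `server_url` is a placeholder, and this assumes the adapter can be constructed without a reachable VM):

```python
from openadapt_evals.adapters.waa import (
    SyntheticTaskError,
    WAALiveAdapter,
    WAALiveConfig,
)

adapter = WAALiveAdapter(WAALiveConfig(server_url="http://vm-ip:5000"))
try:
    adapter.load_task("mock_notepad_001")  # synthetic ID -> rejected up front
except SyntheticTaskError as exc:
    print(exc)  # message points at --mock, --task-ids, or --tasks N
```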
openadapt_evals/adapters/waa/live.py
@@ -15,7 +15,7 @@
not pixel coordinates. WAA's Computer class handles the grounding.

Example:
from openadapt_evals.benchmarks.waa_live import WAALiveAdapter, WAALiveConfig
from openadapt_evals.adapters.waa import WAALiveAdapter, WAALiveConfig

adapter = WAALiveAdapter(WAALiveConfig(server_url="http://vm-ip:5000"))
agent = DemoConditionedAgent(base_agent, retriever)
@@ -26,6 +26,7 @@

import base64
import logging
import re
import time
from dataclasses import dataclass
from typing import Any
@@ -41,6 +42,70 @@
logger = logging.getLogger(__name__)


# WAA task IDs are UUIDs with a domain suffix, e.g., "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx-WOS"
# Common suffixes: WOS (Windows OS), CHR (Chrome), NTP (Notepad), etc.
WAA_TASK_ID_PATTERN = re.compile(
r'^[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}(-[A-Za-z0-9]+)?$'
)

# Synthetic task ID patterns (from mock adapter or testing)
SYNTHETIC_TASK_PATTERNS = [
re.compile(r'^(mock_)?[a-z_]+_\d+$'), # notepad_1, mock_chrome_001
re.compile(r'^mock_'), # any mock_ prefix
]


def is_real_waa_task_id(task_id: str) -> bool:
"""Check if a task ID matches the real WAA UUID format.

Real WAA task IDs are UUIDs from test_small.json or test_all.json, e.g.:
- "a1b2c3d4-e5f6-7890-abcd-ef1234567890-WOS"
- "12345678-1234-1234-1234-123456789012-CHR"

Synthetic task IDs are simple patterns like:
- "notepad_1", "chrome_2" (from mock adapter)
- "mock_notepad_001" (explicit mock prefix)

Args:
task_id: Task identifier to check.

Returns:
True if the task ID appears to be a real WAA UUID.
"""
return bool(WAA_TASK_ID_PATTERN.match(task_id))


def is_synthetic_task_id(task_id: str) -> bool:
"""Check if a task ID appears to be synthetic (for testing).

Args:
task_id: Task identifier to check.

Returns:
True if the task ID matches synthetic patterns.
"""
for pattern in SYNTHETIC_TASK_PATTERNS:
if pattern.match(task_id):
return True
return False


class SyntheticTaskError(ValueError):
"""Raised when a synthetic task ID is used with the live adapter."""

def __init__(self, task_id: str):
self.task_id = task_id
super().__init__(
f"Task ID '{task_id}' appears to be synthetic (for testing). "
f"The live adapter requires real WAA task IDs (UUIDs from test_small.json or test_all.json). "
f"\n\nTo fix this:"
f"\n 1. Use --mock flag for testing without a Windows VM"
f"\n 2. Or provide real WAA task IDs with --task-ids"
f"\n 3. Or use --tasks N to select N random real tasks"
f"\n\nExample real task ID: 'a1b2c3d4-e5f6-7890-abcd-ef1234567890-WOS'"
)


@dataclass
class WAALiveConfig:
"""Configuration for WAALiveAdapter.
Expand Down Expand Up @@ -139,11 +204,20 @@ def load_task(self, task_id: str) -> BenchmarkTask:
3. Creates minimal task as fallback

Args:
task_id: Task identifier (e.g., "notepad_1", "browser_abc123").
task_id: Task identifier. Must be a real WAA UUID
(e.g., "a1b2c3d4-e5f6-7890-abcd-ef1234567890-WOS").

Returns:
BenchmarkTask object with evaluator config if available.

Raises:
SyntheticTaskError: If task_id appears to be synthetic (e.g., "notepad_1").
Use WAAMockAdapter for synthetic/testing tasks.
"""
# Validate that this is a real WAA task ID, not a synthetic one
if is_synthetic_task_id(task_id):
raise SyntheticTaskError(task_id)

import requests

# Try to load from server first
@@ -447,46 +521,45 @@ def evaluate(self, task: BenchmarkTask) -> BenchmarkResult:
return self._evaluate_fallback(task)

def _evaluate_fallback(self, task: BenchmarkTask) -> BenchmarkResult:
"""Fallback evaluation when /evaluate endpoint is unavailable.
"""Fallback when proper evaluation unavailable - returns failure.

Uses a simple heuristic based on:
- Whether the agent took any actions
- Whether the agent called DONE
- Whether the task has success criteria we can check locally
This method explicitly fails instead of providing fake heuristic scores.
Proper evaluation requires either:
1. WAA server with /evaluate endpoint deployed
2. Task configs with evaluator specs (set waa_examples_path)
3. Real WAA task IDs (UUIDs from test_small.json/test_all.json)

Args:
task: Task to evaluate.

Returns:
BenchmarkResult with heuristic-based score.
BenchmarkResult with success=False and score=0.0.
"""
has_actions = len(self._actions) > 0
called_done = any(a.type == "done" for a in self._actions)
typed_text = any(a.type == "type" and a.text for a in self._actions)

# Calculate heuristic score
score = 0.0
if has_actions:
score += 0.2
if called_done:
score += 0.2
if typed_text:
score += 0.1
if self._step_count >= 2:
score += 0.1

# Cap at 0.5 since we can't truly verify success
score = min(score, 0.5)
# Check if task has evaluator config
has_evaluator = bool(
task.raw_config and task.raw_config.get("evaluator")
)

if has_evaluator:
reason = (
"Evaluation unavailable: WAA /evaluate endpoint not deployed. "
"Task has evaluator config but server cannot run it."
)
else:
reason = (
"Evaluation unavailable: task config missing evaluator spec. "
"Set waa_examples_path in config or use real WAA task IDs "
"(UUIDs from test_small.json/test_all.json, not synthetic IDs like 'notepad_1')."
)

logger.error(reason)

return BenchmarkResult(
task_id=task.task_id,
success=False, # Can't determine without proper evaluation
score=score,
success=False,
score=0.0,
num_steps=self._step_count,
reason=(
"Fallback evaluation (WAA /evaluate endpoint unavailable). "
f"Heuristic: actions={len(self._actions)}, done={called_done}, typed={typed_text}"
),
reason=reason,
)

def close(self) -> None:
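With this change, a missing `/evaluate` endpoint surfaces as an explicit failure instead of a partial heuristic score. A hedged sketch of how a caller might collect those results downstream (the `unavailable_evaluations` helper is hypothetical, written against the `BenchmarkResult` fields used above):

```python
def unavailable_evaluations(results):
    """Return results whose WAA evaluation could not actually be run.

    _evaluate_fallback marks these with success=False, score=0.0, and a
    reason beginning with "Evaluation unavailable".
    """
    return [
        r for r in results
        if not r.success and r.score == 0.0 and "Evaluation unavailable" in r.reason
    ]
```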
openadapt_evals/adapters/waa/mock.py
@@ -544,14 +544,24 @@ def _to_waa_action(self, action: BenchmarkAction) -> dict:
class WAAMockAdapter(BenchmarkAdapter):
"""Mock WAA adapter for testing without Windows VM.

This adapter generates synthetic tasks for testing the benchmark infrastructure
without requiring a Windows VM or WAA server. Task IDs are prefixed with "mock_"
to clearly distinguish them from real WAA task IDs.

Useful for:
- Testing the benchmark integration without actual WAA
- Development on non-Windows platforms
- Unit tests
- Verifying agent behavior before running real evaluations

Args:
num_tasks: Number of mock tasks to generate.
domains: Domains to include in mock tasks.

Note:
Mock task IDs use the format "mock_{domain}_{number}" (e.g., "mock_notepad_001").
These IDs are explicitly rejected by WAALiveAdapter to prevent confusion
between testing and real evaluation runs.
"""

def __init__(
@@ -578,21 +588,27 @@ def benchmark_type(self) -> str:
return "interactive"

def _generate_mock_tasks(self) -> None:
"""Generate mock tasks for testing."""
"""Generate mock tasks for testing.

Task IDs use the format "mock_{domain}_{number}" (e.g., "mock_notepad_001")
to clearly distinguish them from real WAA UUIDs. This prevents accidental
use of synthetic tasks with the live adapter.
"""
tasks_per_domain = self._num_tasks // len(self._domains)
extra = self._num_tasks % len(self._domains)

for i, domain in enumerate(self._domains):
count = tasks_per_domain + (1 if i < extra else 0)
for j in range(count):
task_id = f"{domain}_{j + 1}"
# Use mock_ prefix to clearly indicate synthetic task
task_id = f"mock_{domain}_{j + 1:03d}"
self._tasks.append(
BenchmarkTask(
task_id=task_id,
instruction=f"Mock task {j + 1} in {domain} domain",
domain=domain,
time_limit_steps=15,
raw_config={"mock": True},
raw_config={"mock": True, "synthetic": True},
)
)

Expand Down
Loading