Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 75 additions & 9 deletions benchmarks/swebench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,79 @@ def should_wrap_instance(self, instance: EvalInstance) -> bool:
def get_source_repo_path(self, instance: EvalInstance) -> str:
return "/testbed"

def get_repo_path(self, instance: EvalInstance) -> str:
return f"/workspace/{instance.data['repo'].split('/')[-1]}/"

def get_git_patch(
self,
instance: EvalInstance,
workspace: RemoteWorkspace,
) -> str:
repo_path = self.get_repo_path(instance)
base_commit = instance.data["base_commit"]
git_patch_result = workspace.execute_command(
(f"cd {repo_path} ; git --no-pager diff --no-color {base_commit} HEAD")
)
assert git_patch_result.exit_code == 0, (
f"git diff failed: {git_patch_result.stderr}"
)
return git_patch_result.stdout

def get_staged_git_patch(
self,
instance: EvalInstance,
workspace: RemoteWorkspace,
) -> str:
repo_path = self.get_repo_path(instance)
base_commit = instance.data["base_commit"]
git_patch_result = workspace.execute_command(
(f"cd {repo_path} ; git --no-pager diff --no-color --cached {base_commit}")
)
assert git_patch_result.exit_code == 0, (
f"git diff failed: {git_patch_result.stderr}"
)
return git_patch_result.stdout

def collect_failure_test_result(
self,
instance: EvalInstance,
workspace: RemoteWorkspace,
error: Exception,
) -> dict[str, Any]:
repo_path = self.get_repo_path(instance)

# Include newly-created files in the recovered diff. This intentionally
# avoids committing so the failed workspace remains close to its final
# agent-visible state.
git_add = workspace.execute_command(f"cd {repo_path} ; git add -A")
if git_add.exit_code != 0:
logger.warning(
"Failed to stage changes while collecting failure patch for %s: %s",
instance.id,
git_add.stderr,
)

try:
git_patch = self.get_staged_git_patch(instance, workspace)
except Exception as patch_error:
logger.warning(
"Failed to collect failure patch for %s after %s: %s",
instance.id,
type(error).__name__,
patch_error,
)
return {}

logger.info(
"Collected failure patch for %s: %d chars",
instance.id,
len(git_patch),
)
return {
"git_patch": git_patch,
"git_patch_captured_on_error": True,
}

def prepare_instances(self) -> List[EvalInstance]:
logger.info("Setting up SWE-bench evaluation data")

Expand Down Expand Up @@ -294,7 +367,7 @@ def evaluate_instance(

setup_acp_workspace(self.metadata.agent_type, workspace)

repo_path = f"/workspace/{instance.data['repo'].split('/')[-1]}/"
repo_path = self.get_repo_path(instance)
instance.data["repo_path"] = repo_path

persist_callback = build_event_persistence_callback(
Expand Down Expand Up @@ -347,14 +420,7 @@ def evaluate_instance(
)

# Get git patch
base_commit = instance.data["base_commit"]
git_patch_result = workspace.execute_command(
(f"cd {repo_path} ; git --no-pager diff --no-color {base_commit} HEAD")
)
assert git_patch_result.exit_code == 0, (
f"git diff failed: {git_patch_result.stderr}"
)
git_patch = git_patch_result.stdout
git_patch = self.get_git_patch(instance, workspace)

# Log instance summary
summarize_instance(
Expand Down
34 changes: 33 additions & 1 deletion benchmarks/utils/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,9 +340,10 @@ def _create_error_output(
retry_count: int,
metrics: Metrics | None = None,
proxy_cost: float | None = None,
test_result: dict[str, Any] | None = None,
) -> EvalOutput:
"""Create an EvalOutput object for a failed instance."""
test_result: dict[str, Any] = {}
test_result = dict(test_result or {})
if proxy_cost is not None:
test_result["proxy_cost"] = proxy_cost

Expand All @@ -358,6 +359,20 @@ def _create_error_output(
instance=_to_serializable(instance.data),
)

def collect_failure_test_result(
self,
instance: EvalInstance,
workspace: RemoteWorkspace,
error: Exception,
) -> dict[str, Any]:
"""Collect benchmark-specific result artifacts after a failed run.

Called while the workspace is still alive, before cleanup. Benchmarks
that can recover partial outputs, such as a git diff, should override
this hook and return fields to include in the error row's test_result.
"""
return {}

def _capture_conversation_archive(
self,
workspace: RemoteWorkspace,
Expand Down Expand Up @@ -1160,12 +1175,28 @@ def _execute_single_attempt(
exc_info=True,
)
if workspace is not None:
try:
failure_test_result = self.collect_failure_test_result(
instance,
workspace,
e,
)
except Exception as capture_error:
logger.warning(
"[worker] Failed to collect failure test_result for %s: %s",
instance.id,
capture_error,
)
failure_test_result = {}

# Capture the archive before teardown so failure telemetry
# survives even when cleanup later skips duplicate capture.
conversation_archive_path = self._capture_conversation_archive(
workspace,
instance,
)
else:
failure_test_result = {}
recovered_metrics = self._load_metrics_from_conversation_archive(
conversation_archive_path
)
Expand All @@ -1176,6 +1207,7 @@ def _execute_single_attempt(
max_retries,
metrics=recovered_metrics,
proxy_cost=proxy_cost,
test_result=failure_test_result,
)
if runtime_runs:
error_output.runtime_runs = runtime_runs
Expand Down
94 changes: 94 additions & 0 deletions tests/test_swebench_failure_patch_capture.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
from types import SimpleNamespace
from typing import cast

from benchmarks.swebench.run_infer import SWEBenchEvaluation
from benchmarks.utils.critics import PassCritic
from benchmarks.utils.evaluation import Evaluation
from benchmarks.utils.models import EvalInstance, EvalMetadata
from openhands.sdk import LLM
from openhands.sdk.workspace import RemoteWorkspace


class DummyEvaluation(Evaluation):
def prepare_instances(self):
return []

def prepare_workspace(
self,
instance,
resource_factor=1,
forward_env=None,
):
raise NotImplementedError

def evaluate_instance(self, instance, workspace):
raise NotImplementedError


class FakeWorkspace:
def __init__(self):
self.commands = []

def execute_command(self, command, *args, **kwargs):
self.commands.append(command)
if "git --no-pager diff --no-color --cached abc123" in command:
return SimpleNamespace(
exit_code=0,
stdout="diff --git a/file.py b/file.py\n",
stderr="",
)
if "git --no-pager diff --no-color abc123 HEAD" in command:
return SimpleNamespace(exit_code=0, stdout="", stderr="")
return SimpleNamespace(exit_code=0, stdout="", stderr="")


def _metadata(tmp_path):
return EvalMetadata(
llm=LLM(model="test-model", api_key="test-key"),
dataset="test-dataset",
max_iterations=1,
eval_output_dir=str(tmp_path),
critic=PassCritic(),
)


def test_error_output_preserves_failure_test_result(tmp_path):
evaluation = DummyEvaluation(metadata=_metadata(tmp_path))
instance = EvalInstance(id="instance-1", data={})

output = evaluation._create_error_output(
instance,
RuntimeError("boom"),
retry_count=0,
test_result={"git_patch": "diff --git a/file.py b/file.py\n"},
)

assert output.error is not None
assert output.test_result["git_patch"].startswith("diff --git")


def test_swebench_collect_failure_test_result_gets_git_patch(tmp_path):
evaluation = SWEBenchEvaluation(metadata=_metadata(tmp_path))
instance = EvalInstance(
id="django__django-12345",
data={
"repo": "django/django",
"base_commit": "abc123",
},
)
workspace = FakeWorkspace()

result = evaluation.collect_failure_test_result(
instance,
cast(RemoteWorkspace, workspace),
RuntimeError("Remote conversation got stuck"),
)

assert result == {
"git_patch": "diff --git a/file.py b/file.py\n",
"git_patch_captured_on_error": True,
}
assert workspace.commands == [
"cd /workspace/django/ ; git add -A",
"cd /workspace/django/ ; git --no-pager diff --no-color --cached abc123",
]
Loading