diff --git a/benchmarks/swebench/run_infer.py b/benchmarks/swebench/run_infer.py index 33f802373..65f8c1530 100644 --- a/benchmarks/swebench/run_infer.py +++ b/benchmarks/swebench/run_infer.py @@ -108,6 +108,79 @@ def should_wrap_instance(self, instance: EvalInstance) -> bool: def get_source_repo_path(self, instance: EvalInstance) -> str: return "/testbed" + def get_repo_path(self, instance: EvalInstance) -> str: + return f"/workspace/{instance.data['repo'].split('/')[-1]}/" + + def get_git_patch( + self, + instance: EvalInstance, + workspace: RemoteWorkspace, + ) -> str: + repo_path = self.get_repo_path(instance) + base_commit = instance.data["base_commit"] + git_patch_result = workspace.execute_command( + (f"cd {repo_path} ; git --no-pager diff --no-color {base_commit} HEAD") + ) + assert git_patch_result.exit_code == 0, ( + f"git diff failed: {git_patch_result.stderr}" + ) + return git_patch_result.stdout + + def get_staged_git_patch( + self, + instance: EvalInstance, + workspace: RemoteWorkspace, + ) -> str: + repo_path = self.get_repo_path(instance) + base_commit = instance.data["base_commit"] + git_patch_result = workspace.execute_command( + (f"cd {repo_path} ; git --no-pager diff --no-color --cached {base_commit}") + ) + assert git_patch_result.exit_code == 0, ( + f"git diff failed: {git_patch_result.stderr}" + ) + return git_patch_result.stdout + + def collect_failure_test_result( + self, + instance: EvalInstance, + workspace: RemoteWorkspace, + error: Exception, + ) -> dict[str, Any]: + repo_path = self.get_repo_path(instance) + + # Include newly-created files in the recovered diff. This intentionally + # avoids committing so the failed workspace remains close to its final + # agent-visible state. + git_add = workspace.execute_command(f"cd {repo_path} ; git add -A") + if git_add.exit_code != 0: + logger.warning( + "Failed to stage changes while collecting failure patch for %s: %s", + instance.id, + git_add.stderr, + ) + + try: + git_patch = self.get_staged_git_patch(instance, workspace) + except Exception as patch_error: + logger.warning( + "Failed to collect failure patch for %s after %s: %s", + instance.id, + type(error).__name__, + patch_error, + ) + return {} + + logger.info( + "Collected failure patch for %s: %d chars", + instance.id, + len(git_patch), + ) + return { + "git_patch": git_patch, + "git_patch_captured_on_error": True, + } + def prepare_instances(self) -> List[EvalInstance]: logger.info("Setting up SWE-bench evaluation data") @@ -294,7 +367,7 @@ def evaluate_instance( setup_acp_workspace(self.metadata.agent_type, workspace) - repo_path = f"/workspace/{instance.data['repo'].split('/')[-1]}/" + repo_path = self.get_repo_path(instance) instance.data["repo_path"] = repo_path persist_callback = build_event_persistence_callback( @@ -347,14 +420,7 @@ def evaluate_instance( ) # Get git patch - base_commit = instance.data["base_commit"] - git_patch_result = workspace.execute_command( - (f"cd {repo_path} ; git --no-pager diff --no-color {base_commit} HEAD") - ) - assert git_patch_result.exit_code == 0, ( - f"git diff failed: {git_patch_result.stderr}" - ) - git_patch = git_patch_result.stdout + git_patch = self.get_git_patch(instance, workspace) # Log instance summary summarize_instance( diff --git a/benchmarks/utils/evaluation.py b/benchmarks/utils/evaluation.py index e9adc9249..60857a354 100644 --- a/benchmarks/utils/evaluation.py +++ b/benchmarks/utils/evaluation.py @@ -340,9 +340,10 @@ def _create_error_output( retry_count: int, metrics: Metrics | None = None, proxy_cost: float | None = None, + test_result: dict[str, Any] | None = None, ) -> EvalOutput: """Create an EvalOutput object for a failed instance.""" - test_result: dict[str, Any] = {} + test_result = dict(test_result or {}) if proxy_cost is not None: test_result["proxy_cost"] = proxy_cost @@ -358,6 +359,20 @@ def _create_error_output( instance=_to_serializable(instance.data), ) + def collect_failure_test_result( + self, + instance: EvalInstance, + workspace: RemoteWorkspace, + error: Exception, + ) -> dict[str, Any]: + """Collect benchmark-specific result artifacts after a failed run. + + Called while the workspace is still alive, before cleanup. Benchmarks + that can recover partial outputs, such as a git diff, should override + this hook and return fields to include in the error row's test_result. + """ + return {} + def _capture_conversation_archive( self, workspace: RemoteWorkspace, @@ -1160,12 +1175,28 @@ def _execute_single_attempt( exc_info=True, ) if workspace is not None: + try: + failure_test_result = self.collect_failure_test_result( + instance, + workspace, + e, + ) + except Exception as capture_error: + logger.warning( + "[worker] Failed to collect failure test_result for %s: %s", + instance.id, + capture_error, + ) + failure_test_result = {} + # Capture the archive before teardown so failure telemetry # survives even when cleanup later skips duplicate capture. conversation_archive_path = self._capture_conversation_archive( workspace, instance, ) + else: + failure_test_result = {} recovered_metrics = self._load_metrics_from_conversation_archive( conversation_archive_path ) @@ -1176,6 +1207,7 @@ def _execute_single_attempt( max_retries, metrics=recovered_metrics, proxy_cost=proxy_cost, + test_result=failure_test_result, ) if runtime_runs: error_output.runtime_runs = runtime_runs diff --git a/tests/test_swebench_failure_patch_capture.py b/tests/test_swebench_failure_patch_capture.py new file mode 100644 index 000000000..5a734ca11 --- /dev/null +++ b/tests/test_swebench_failure_patch_capture.py @@ -0,0 +1,94 @@ +from types import SimpleNamespace +from typing import cast + +from benchmarks.swebench.run_infer import SWEBenchEvaluation +from benchmarks.utils.critics import PassCritic +from benchmarks.utils.evaluation import Evaluation +from benchmarks.utils.models import EvalInstance, EvalMetadata +from openhands.sdk import LLM +from openhands.sdk.workspace import RemoteWorkspace + + +class DummyEvaluation(Evaluation): + def prepare_instances(self): + return [] + + def prepare_workspace( + self, + instance, + resource_factor=1, + forward_env=None, + ): + raise NotImplementedError + + def evaluate_instance(self, instance, workspace): + raise NotImplementedError + + +class FakeWorkspace: + def __init__(self): + self.commands = [] + + def execute_command(self, command, *args, **kwargs): + self.commands.append(command) + if "git --no-pager diff --no-color --cached abc123" in command: + return SimpleNamespace( + exit_code=0, + stdout="diff --git a/file.py b/file.py\n", + stderr="", + ) + if "git --no-pager diff --no-color abc123 HEAD" in command: + return SimpleNamespace(exit_code=0, stdout="", stderr="") + return SimpleNamespace(exit_code=0, stdout="", stderr="") + + +def _metadata(tmp_path): + return EvalMetadata( + llm=LLM(model="test-model", api_key="test-key"), + dataset="test-dataset", + max_iterations=1, + eval_output_dir=str(tmp_path), + critic=PassCritic(), + ) + + +def test_error_output_preserves_failure_test_result(tmp_path): + evaluation = DummyEvaluation(metadata=_metadata(tmp_path)) + instance = EvalInstance(id="instance-1", data={}) + + output = evaluation._create_error_output( + instance, + RuntimeError("boom"), + retry_count=0, + test_result={"git_patch": "diff --git a/file.py b/file.py\n"}, + ) + + assert output.error is not None + assert output.test_result["git_patch"].startswith("diff --git") + + +def test_swebench_collect_failure_test_result_gets_git_patch(tmp_path): + evaluation = SWEBenchEvaluation(metadata=_metadata(tmp_path)) + instance = EvalInstance( + id="django__django-12345", + data={ + "repo": "django/django", + "base_commit": "abc123", + }, + ) + workspace = FakeWorkspace() + + result = evaluation.collect_failure_test_result( + instance, + cast(RemoteWorkspace, workspace), + RuntimeError("Remote conversation got stuck"), + ) + + assert result == { + "git_patch": "diff --git a/file.py b/file.py\n", + "git_patch_captured_on_error": True, + } + assert workspace.commands == [ + "cd /workspace/django/ ; git add -A", + "cd /workspace/django/ ; git --no-pager diff --no-color --cached abc123", + ]