OpenHands · neubig · Jun 10, 2026 · Jun 10, 2026 · Jun 12, 2026 · Jun 12, 2026
diff --git a/benchmarks/swebench/run_infer.py b/benchmarks/swebench/run_infer.py
@@ -108,6 +108,79 @@ def should_wrap_instance(self, instance: EvalInstance) -> bool:
     def get_source_repo_path(self, instance: EvalInstance) -> str:
         return "/testbed"
 
+    def get_repo_path(self, instance: EvalInstance) -> str:
+        return f"/workspace/{instance.data['repo'].split('/')[-1]}/"
+
+    def get_git_patch(
+        self,
+        instance: EvalInstance,
+        workspace: RemoteWorkspace,
+    ) -> str:
+        repo_path = self.get_repo_path(instance)
+        base_commit = instance.data["base_commit"]
+        git_patch_result = workspace.execute_command(
+            (f"cd {repo_path} ; git --no-pager diff --no-color {base_commit} HEAD")
+        )
+        assert git_patch_result.exit_code == 0, (
+            f"git diff failed: {git_patch_result.stderr}"
+        )
+        return git_patch_result.stdout
+
+    def get_staged_git_patch(
+        self,
+        instance: EvalInstance,
+        workspace: RemoteWorkspace,
+    ) -> str:
+        repo_path = self.get_repo_path(instance)
+        base_commit = instance.data["base_commit"]
+        git_patch_result = workspace.execute_command(
+            (f"cd {repo_path} ; git --no-pager diff --no-color --cached {base_commit}")
+        )
+        assert git_patch_result.exit_code == 0, (
+            f"git diff failed: {git_patch_result.stderr}"
+        )
+        return git_patch_result.stdout
+
+    def collect_failure_test_result(
+        self,
+        instance: EvalInstance,
+        workspace: RemoteWorkspace,
+        error: Exception,
+    ) -> dict[str, Any]:
+        repo_path = self.get_repo_path(instance)
+
+        # Include newly-created files in the recovered diff. This intentionally
+        # avoids committing so the failed workspace remains close to its final
+        # agent-visible state.
+        git_add = workspace.execute_command(f"cd {repo_path} ; git add -A")
+        if git_add.exit_code != 0:
+            logger.warning(
+                "Failed to stage changes while collecting failure patch for %s: %s",
+                instance.id,
+                git_add.stderr,
+            )
+
+        try:
+            git_patch = self.get_staged_git_patch(instance, workspace)
+        except Exception as patch_error:
+            logger.warning(
+                "Failed to collect failure patch for %s after %s: %s",
+                instance.id,
+                type(error).__name__,
+                patch_error,
+            )
+            return {}
+
+        logger.info(
+            "Collected failure patch for %s: %d chars",
+            instance.id,
+            len(git_patch),
+        )
+        return {
+            "git_patch": git_patch,
+            "git_patch_captured_on_error": True,
+        }
+
     def prepare_instances(self) -> List[EvalInstance]:
         logger.info("Setting up SWE-bench evaluation data")
 
@@ -294,7 +367,7 @@ def evaluate_instance(
 
         setup_acp_workspace(self.metadata.agent_type, workspace)
 
-        repo_path = f"/workspace/{instance.data['repo'].split('/')[-1]}/"
+        repo_path = self.get_repo_path(instance)
         instance.data["repo_path"] = repo_path
 
         persist_callback = build_event_persistence_callback(
@@ -347,14 +420,7 @@ def evaluate_instance(
         )
 
         # Get git patch
-        base_commit = instance.data["base_commit"]
-        git_patch_result = workspace.execute_command(
-            (f"cd {repo_path} ; git --no-pager diff --no-color {base_commit} HEAD")
-        )
-        assert git_patch_result.exit_code == 0, (
-            f"git diff failed: {git_patch_result.stderr}"
-        )
-        git_patch = git_patch_result.stdout
+        git_patch = self.get_git_patch(instance, workspace)
 
         # Log instance summary
         summarize_instance(

diff --git a/benchmarks/utils/evaluation.py b/benchmarks/utils/evaluation.py
@@ -340,9 +340,10 @@ def _create_error_output(
         retry_count: int,
         metrics: Metrics | None = None,
         proxy_cost: float | None = None,
+        test_result: dict[str, Any] | None = None,
     ) -> EvalOutput:
         """Create an EvalOutput object for a failed instance."""
-        test_result: dict[str, Any] = {}
+        test_result = dict(test_result or {})
         if proxy_cost is not None:
             test_result["proxy_cost"] = proxy_cost
 
@@ -358,6 +359,20 @@ def _create_error_output(
             instance=_to_serializable(instance.data),
         )
 
+    def collect_failure_test_result(
+        self,
+        instance: EvalInstance,
+        workspace: RemoteWorkspace,
+        error: Exception,
+    ) -> dict[str, Any]:
+        """Collect benchmark-specific result artifacts after a failed run.
+
+        Called while the workspace is still alive, before cleanup. Benchmarks
+        that can recover partial outputs, such as a git diff, should override
+        this hook and return fields to include in the error row's test_result.
+        """
+        return {}
+
     def _capture_conversation_archive(
         self,
         workspace: RemoteWorkspace,
@@ -1160,12 +1175,28 @@ def _execute_single_attempt(
                     exc_info=True,
                 )
                 if workspace is not None:
+                    try:
+                        failure_test_result = self.collect_failure_test_result(
+                            instance,
+                            workspace,
+                            e,
+                        )
+                    except Exception as capture_error:
+                        logger.warning(
+                            "[worker] Failed to collect failure test_result for %s: %s",
+                            instance.id,
+                            capture_error,
+                        )
+                        failure_test_result = {}
+
                     # Capture the archive before teardown so failure telemetry
                     # survives even when cleanup later skips duplicate capture.
                     conversation_archive_path = self._capture_conversation_archive(
                         workspace,
                         instance,
                     )
+                else:
+                    failure_test_result = {}
                 recovered_metrics = self._load_metrics_from_conversation_archive(
                     conversation_archive_path
                 )
@@ -1176,6 +1207,7 @@ def _execute_single_attempt(
                     max_retries,
                     metrics=recovered_metrics,
                     proxy_cost=proxy_cost,
+                    test_result=failure_test_result,
                 )
                 if runtime_runs:
                     error_output.runtime_runs = runtime_runs

diff --git a/tests/test_swebench_failure_patch_capture.py b/tests/test_swebench_failure_patch_capture.py
@@ -0,0 +1,94 @@
+from types import SimpleNamespace
+from typing import cast
+
+from benchmarks.swebench.run_infer import SWEBenchEvaluation
+from benchmarks.utils.critics import PassCritic
+from benchmarks.utils.evaluation import Evaluation
+from benchmarks.utils.models import EvalInstance, EvalMetadata
+from openhands.sdk import LLM
+from openhands.sdk.workspace import RemoteWorkspace
+
+
+class DummyEvaluation(Evaluation):
+    def prepare_instances(self):
+        return []
+
+    def prepare_workspace(
+        self,
+        instance,
+        resource_factor=1,
+        forward_env=None,
+    ):
+        raise NotImplementedError
+
+    def evaluate_instance(self, instance, workspace):
+        raise NotImplementedError
+
+
+class FakeWorkspace:
+    def __init__(self):
+        self.commands = []
+
+    def execute_command(self, command, *args, **kwargs):
+        self.commands.append(command)
+        if "git --no-pager diff --no-color --cached abc123" in command:
+            return SimpleNamespace(
+                exit_code=0,
+                stdout="diff --git a/file.py b/file.py\n",
+                stderr="",
+            )
+        if "git --no-pager diff --no-color abc123 HEAD" in command:
+            return SimpleNamespace(exit_code=0, stdout="", stderr="")
+        return SimpleNamespace(exit_code=0, stdout="", stderr="")
+
+
+def _metadata(tmp_path):
+    return EvalMetadata(
+        llm=LLM(model="test-model", api_key="test-key"),
+        dataset="test-dataset",
+        max_iterations=1,
+        eval_output_dir=str(tmp_path),
+        critic=PassCritic(),
+    )
+
+
+def test_error_output_preserves_failure_test_result(tmp_path):
+    evaluation = DummyEvaluation(metadata=_metadata(tmp_path))
+    instance = EvalInstance(id="instance-1", data={})
+
+    output = evaluation._create_error_output(
+        instance,
+        RuntimeError("boom"),
+        retry_count=0,
+        test_result={"git_patch": "diff --git a/file.py b/file.py\n"},
+    )
+
+    assert output.error is not None
+    assert output.test_result["git_patch"].startswith("diff --git")
+
+
+def test_swebench_collect_failure_test_result_gets_git_patch(tmp_path):
+    evaluation = SWEBenchEvaluation(metadata=_metadata(tmp_path))
+    instance = EvalInstance(
+        id="django__django-12345",
+        data={
+            "repo": "django/django",
+            "base_commit": "abc123",
+        },
+    )
+    workspace = FakeWorkspace()
+
+    result = evaluation.collect_failure_test_result(
+        instance,
+        cast(RemoteWorkspace, workspace),
+        RuntimeError("Remote conversation got stuck"),
+    )
+
+    assert result == {
+        "git_patch": "diff --git a/file.py b/file.py\n",
+        "git_patch_captured_on_error": True,
+    }
+    assert workspace.commands == [
+        "cd /workspace/django/ ; git add -A",
+        "cd /workspace/django/ ; git --no-pager diff --no-color --cached abc123",
+    ]