Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@

### Bugs Fixed

- Fixed red team scan status stuck at `in_progress` in results.json despite the scan completing, by treating leftover `pending` entries as `failed`.
- Fixed `ungrounded_attributes` risk category being silently skipped due to a cache key mismatch (`isa` vs `ungrounded_attributes`) in the Foundry execution path.
- Fixed RAI evaluation service errors (`ServiceInvocationException`) incorrectly inflating attack success rate by treating error responses as undetermined instead of attack success.
Comment on lines +7 to +9
Copy link

Copilot AI Mar 8, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The PR description is currently a placeholder and doesn’t explain the changes or link relevant issues. Please replace it with an informative description so reviewers and release tooling can understand intent and scope.

Copilot uses AI. Check for mistakes.
- Fixed Foundry red team double-evaluation that caused ~3x scan latency by removing a redundant `evaluation_processor.evaluate()` call.
- Fixed loss of red team attack details caused by baseline results overwriting previously stored evaluation results.
- Fixed incorrect `per_testing_criteria` output that listed attack strategies alongside risk categories.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,8 @@ async def _score_piece_async(
result_label = ""
passed = None

matched_result_dict = None

if hasattr(eval_result, "results") or (isinstance(eval_result, dict) and "results" in eval_result):
results = eval_result.results if hasattr(eval_result, "results") else eval_result.get("results", [])
results = results or []
Expand All @@ -197,8 +199,22 @@ async def _score_piece_async(
threshold = result_dict.get("threshold")
passed = result_dict.get("passed")
result_label = result_dict.get("label") or ""
matched_result_dict = result_dict
break

# Check for evaluation service errors (e.g. ServiceInvocationException).
# These return score=0.0 and passed=False but with properties.outcome="error",
# meaning the evaluation didn't actually run. Raising here lets the existing
# except block re-raise so PyRIT marks the score as UNDETERMINED.
if matched_result_dict is not None:
result_properties = matched_result_dict.get("properties", {})
if isinstance(result_properties, dict) and result_properties.get("outcome") == "error":
error_detail = result_properties.get("error", "Unknown evaluation error")
raise RuntimeError(
f"RAI evaluation service returned an error for {metric_name}: {error_detail}. "
f"Score will be treated as undetermined."
)

if raw_score is None:
self.logger.warning(f"No matching result found for metric '{metric_name}' in evaluation response.")
raw_score = 0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1746,7 +1746,7 @@ async def _execute_attacks_with_foundry(
objectives_by_risk[risk_value] = []

# Get baseline objectives for this risk category from cache
baseline_key = ((risk_value,), "baseline")
baseline_key = ((get_attack_objective_from_risk_category(risk_category).lower(),), "baseline")
self.logger.debug(f"Looking for baseline_key: {baseline_key}")
self.logger.debug(f"Available keys in attack_objectives: {list(self.attack_objectives.keys())}")
if baseline_key in self.attack_objectives:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1493,7 +1493,10 @@ def _determine_run_status(
) -> str:
"""Determine the run-level status based on red team info status values."""

# Check if any tasks are still incomplete/failed
# Check if any tasks are incomplete/failed/were never executed.
# By the time this method is called the scan is finished, so "pending"
# (category was skipped or never ran) and "running" are also terminal
# failures rather than signs of ongoing work.
if isinstance(red_team_info, dict):
for risk_data in red_team_info.values():
if not isinstance(risk_data, dict):
Expand All @@ -1502,10 +1505,8 @@ def _determine_run_status(
if not isinstance(details, dict):
continue
status = details.get("status", "").lower()
if status in ("incomplete", "failed", "timeout"):
if status in ("incomplete", "failed", "timeout", "pending", "running"):
return "failed"
elif status in ("running", "pending"):
return "in_progress"

return "completed"

Expand Down
2 changes: 1 addition & 1 deletion sdk/evaluation/azure-ai-evaluation/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,7 +327,7 @@ def simple_conversation():
def redirect_openai_requests():
"""Route requests from the openai package to the test proxy."""
config = TestProxyConfig(
recording_id=get_recording_id(), recording_mode="record" if is_live() else "playback", proxy_url=PROXY_URL()
recording_id=get_recording_id(), recording_mode="record" if is_live() else "playback", proxy_url=PROXY_URL
Copy link

Copilot AI Mar 8, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PROXY_URL is imported from devtools_testutils.config as a function (see eng/tools/azure-sdk-tools/devtools_testutils/config.py:18-20). Passing proxy_url=PROXY_URL will pass the function object instead of the expected URL string, which will break proxy routing. Call it (PROXY_URL()) or import/define a string constant instead.

Suggested change
recording_id=get_recording_id(), recording_mode="record" if is_live() else "playback", proxy_url=PROXY_URL
recording_id=get_recording_id(),
recording_mode="record" if is_live() else "playback",
proxy_url=PROXY_URL(),

Copilot uses AI. Check for mistakes.
)

with TestProxyHttpxClientBase.record_with_proxy(config):
Expand Down
Loading
Loading