Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@

### Bugs Fixed

- Fixed red team scan status stuck at `in_progress` in results.json despite the scan completing, by treating leftover `pending` entries as `failed`.
- Fixed `ungrounded_attributes` risk category being silently skipped due to a cache key mismatch (`isa` vs `ungrounded_attributes`) in the Foundry execution path.
- Fixed RAI evaluation service errors (`ServiceInvocationException`) incorrectly inflating attack success rate by treating error responses as undetermined instead of attack success.
Comment on lines +7 to +9
Copy link

Copilot AI Mar 8, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The PR description is currently a placeholder and doesn’t explain the changes or link relevant issues. Please replace it with an informative description so reviewers and release tooling can understand intent and scope.

Copilot uses AI. Check for mistakes.
- Fixed Foundry red team double-evaluation that caused ~3x scan latency by removing a redundant `evaluation_processor.evaluate()` call.
- Fixed loss of red team attack details caused by baseline results overwriting previously stored evaluation results.
- Fixed incorrect `per_testing_criteria` output that listed attack strategies alongside risk categories.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,8 @@ async def _score_piece_async(
result_label = ""
passed = None

matched_result_dict = None

if hasattr(eval_result, "results") or (isinstance(eval_result, dict) and "results" in eval_result):
results = eval_result.results if hasattr(eval_result, "results") else eval_result.get("results", [])
results = results or []
Expand All @@ -197,8 +199,22 @@ async def _score_piece_async(
threshold = result_dict.get("threshold")
passed = result_dict.get("passed")
result_label = result_dict.get("label") or ""
matched_result_dict = result_dict
break

# Check for evaluation service errors (e.g. ServiceInvocationException).
# These return score=0.0 and passed=False but with properties.outcome="error",
# meaning the evaluation didn't actually run. Raising here lets the existing
# except block re-raise so PyRIT marks the score as UNDETERMINED.
if matched_result_dict is not None:
result_properties = matched_result_dict.get("properties", {})
if isinstance(result_properties, dict) and result_properties.get("outcome") == "error":
error_detail = result_properties.get("error", "Unknown evaluation error")
raise RuntimeError(
f"RAI evaluation service returned an error for {metric_name}: {error_detail}. "
f"Score will be treated as undetermined."
)

if raw_score is None:
self.logger.warning(f"No matching result found for metric '{metric_name}' in evaluation response.")
raw_score = 0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1746,7 +1746,7 @@ async def _execute_attacks_with_foundry(
objectives_by_risk[risk_value] = []

# Get baseline objectives for this risk category from cache
baseline_key = ((risk_value,), "baseline")
baseline_key = ((get_attack_objective_from_risk_category(risk_category).lower(),), "baseline")
self.logger.debug(f"Looking for baseline_key: {baseline_key}")
self.logger.debug(f"Available keys in attack_objectives: {list(self.attack_objectives.keys())}")
if baseline_key in self.attack_objectives:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1493,7 +1493,10 @@ def _determine_run_status(
) -> str:
"""Determine the run-level status based on red team info status values."""

# Check if any tasks are still incomplete/failed
# Check if any tasks are incomplete/failed/were never executed.
# By the time this method is called the scan is finished, so "pending"
# (category was skipped or never ran) and "running" are also terminal
# failures rather than signs of ongoing work.
if isinstance(red_team_info, dict):
for risk_data in red_team_info.values():
if not isinstance(risk_data, dict):
Expand All @@ -1502,10 +1505,8 @@ def _determine_run_status(
if not isinstance(details, dict):
continue
status = details.get("status", "").lower()
if status in ("incomplete", "failed", "timeout"):
if status in ("incomplete", "failed", "timeout", "pending", "running"):
return "failed"
elif status in ("running", "pending"):
return "in_progress"

return "completed"

Expand Down
2 changes: 1 addition & 1 deletion sdk/evaluation/azure-ai-evaluation/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,7 +327,7 @@ def simple_conversation():
def redirect_openai_requests():
"""Route requests from the openai package to the test proxy."""
config = TestProxyConfig(
recording_id=get_recording_id(), recording_mode="record" if is_live() else "playback", proxy_url=PROXY_URL()
recording_id=get_recording_id(), recording_mode="record" if is_live() else "playback", proxy_url=PROXY_URL
Copy link

Copilot AI Mar 8, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PROXY_URL is imported from devtools_testutils.config as a function (see eng/tools/azure-sdk-tools/devtools_testutils/config.py:18-20). Passing proxy_url=PROXY_URL will pass the function object instead of the expected URL string, which will break proxy routing. Call it (PROXY_URL()) or import/define a string constant instead.

Suggested change
recording_id=get_recording_id(), recording_mode="record" if is_live() else "playback", proxy_url=PROXY_URL
recording_id=get_recording_id(),
recording_mode="record" if is_live() else "playback",
proxy_url=PROXY_URL(),

Copilot uses AI. Check for mistakes.
)

with TestProxyHttpxClientBase.record_with_proxy(config):
Expand Down
Loading
Loading