diff --git a/docs/decisions/0020-foundry-evals-integration.md b/docs/decisions/0020-foundry-evals-integration.md index f5b5db4db5..ea9d2f3c69 100644 --- a/docs/decisions/0020-foundry-evals-integration.md +++ b/docs/decisions/0020-foundry-evals-integration.md @@ -462,7 +462,7 @@ class FoundryEvals: ### Azure AI: FoundryEvals Constants ```python -from agent_framework_azure_ai import FoundryEvals +from agent_framework.foundry import FoundryEvals evaluators = [FoundryEvals.RELEVANCE, FoundryEvals.TOOL_CALL_ACCURACY] ``` diff --git a/python/packages/core/agent_framework/__init__.py b/python/packages/core/agent_framework/__init__.py index 0f652f23bd..a9e4245e77 100644 --- a/python/packages/core/agent_framework/__init__.py +++ b/python/packages/core/agent_framework/__init__.py @@ -57,6 +57,27 @@ included_messages, included_token_count, ) +from ._evaluation import ( + AgentEvalConverter, + CheckResult, + ConversationSplit, + ConversationSplitter, + EvalItem, + EvalItemResult, + EvalNotPassedError, + EvalResults, + EvalScoreResult, + Evaluator, + ExpectedToolCall, + LocalEvaluator, + evaluate_agent, + evaluate_workflow, + evaluator, + keyword_check, + tool_call_args_match, + tool_called_check, + tool_calls_present, +) from ._mcp import MCPStdioTool, MCPStreamableHTTPTool, MCPWebsocketTool from ._middleware import ( AgentContext, @@ -242,6 +263,7 @@ "USER_AGENT_TELEMETRY_DISABLED_ENV_VAR", "Agent", "AgentContext", + "AgentEvalConverter", "AgentExecutor", "AgentExecutorRequest", "AgentExecutorResponse", @@ -268,11 +290,14 @@ "ChatOptions", "ChatResponse", "ChatResponseUpdate", + "CheckResult", "CheckpointStorage", "CompactionProvider", "CompactionStrategy", "Content", "ContinuationToken", + "ConversationSplit", + "ConversationSplitter", "Default", "Edge", "EdgeCondition", @@ -281,7 +306,14 @@ "EmbeddingGenerationOptions", "EmbeddingInputT", "EmbeddingT", + "EvalItem", + "EvalItemResult", + "EvalNotPassedError", + "EvalResults", + "EvalScoreResult", + "Evaluator", "Executor", + "ExpectedToolCall", "FanInEdgeGroup", "FanOutEdgeGroup", "FileCheckpointStorage", @@ -300,6 +332,7 @@ "InMemoryCheckpointStorage", "InMemoryHistoryProvider", "InProcRunnerContext", + "LocalEvaluator", "MCPStdioTool", "MCPStreamableHTTPTool", "MCPWebsocketTool", @@ -379,11 +412,15 @@ "chat_middleware", "create_edge_runner", "detect_media_type_from_base64", + "evaluate_agent", + "evaluate_workflow", + "evaluator", "executor", "function_middleware", "handler", "included_messages", "included_token_count", + "keyword_check", "load_settings", "map_chat_to_agent_update", "merge_chat_options", @@ -396,6 +433,9 @@ "resolve_agent_id", "response_handler", "tool", + "tool_call_args_match", + "tool_called_check", + "tool_calls_present", "validate_chat_options", "validate_tool_mode", "validate_tools", diff --git a/python/packages/core/agent_framework/_evaluation.py b/python/packages/core/agent_framework/_evaluation.py new file mode 100644 index 0000000000..b3460040af --- /dev/null +++ b/python/packages/core/agent_framework/_evaluation.py @@ -0,0 +1,1884 @@ +# Copyright (c) Microsoft. All rights reserved. + +"""Provider-agnostic evaluation framework for Microsoft Agent Framework. + +Defines the core evaluation types and orchestration functions that work with +any evaluation provider (Azure AI Foundry, local evaluators, third-party +libraries, etc.). Also includes ``LocalEvaluator`` and built-in check +functions for fast, API-free evaluation during inner-loop development and +CI smoke tests. + +Cloud evaluator example: + +.. 
code-block:: python + + from agent_framework import evaluate_agent, EvalResults + from agent_framework.foundry import FoundryEvals + + evals = FoundryEvals(project_client=client, model="gpt-4o") + results = await evaluate_agent(agent=agent, queries=["Hello"], evaluators=evals) + results.raise_for_status() + +Local evaluator example: + +.. code-block:: python + + from agent_framework import LocalEvaluator, keyword_check, evaluate_agent + + local = LocalEvaluator( + keyword_check("weather", "temperature"), + tool_called_check("get_weather"), + ) + results = await evaluate_agent(agent=agent, queries=queries, evaluators=local) +""" + +from __future__ import annotations + +import asyncio +import contextlib +import inspect +import json +import logging +from collections.abc import Awaitable, Callable, Sequence +from dataclasses import dataclass, field +from enum import Enum +from typing import ( + TYPE_CHECKING, + Any, + Literal, + Protocol, + TypedDict, + Union, + cast, + overload, + runtime_checkable, +) + +from ._tools import FunctionTool +from ._types import AgentResponse, Message + +if TYPE_CHECKING: + from ._agents import SupportsAgentRun + from ._workflows._agent_executor import AgentExecutorResponse + from ._workflows._workflow import Workflow, WorkflowRunResult + +logger = logging.getLogger(__name__) + + +class EvalNotPassedError(Exception): + """Raised when evaluation results contain failures.""" + + +# region Core types + + +class ConversationSplit(str, Enum): + """Built-in strategies for splitting a conversation into query/response halves. + + Different splits evaluate different aspects of agent behavior: + + - ``LAST_TURN``: Split at the last user message. Everything up to and + including that message is the query; everything after is the response. + Evaluates whether the agent answered the *latest* question well. + + - ``FULL``: The first user message (and any preceding system messages) is + the query; the entire remainder of the conversation is the response. + Evaluates whether the *whole conversation trajectory* served the + original request. + + For custom splits (e.g. split before a memory-retrieval tool call), + pass a callable instead — see ``ConversationSplitter``. + """ + + LAST_TURN = "last_turn" + FULL = "full" + + +ConversationSplitter = Union[ + ConversationSplit, + Callable[[list[Message]], tuple[list[Message], list[Message]]], +] +"""Type accepted by ``EvalItem.split_messages(split=...)``. + +Either a built-in ``ConversationSplit`` enum value **or** a callable with +signature: + +.. code-block:: python + + def my_splitter(conversation: list[Message]) -> tuple[list[Message], list[Message]]: + '''Return (query_messages, response_messages).''' + +Custom splitters let you evaluate domain-specific boundaries — for example, +splitting just before a memory-retrieval tool call to evaluate recall quality: + +.. code-block:: python + + def split_before_memory(conversation): + for i, msg in enumerate(conversation): + for c in msg.contents or []: + if c.type == "function_call" and c.name == "retrieve_memory": + return conversation[:i], conversation[i:] + # Fallback: split at last user message + return EvalItem._split_last_turn_static(conversation) + + item.split_messages(split=split_before_memory) +""" + + +@dataclass +class ExpectedToolCall: + """A tool call that an agent is expected to make. + + Used with :func:`evaluate_agent` to assert that the agent called the + correct tools. 
The *evaluator* decides the matching semantics (order, + extras, argument checking); this type is pure data. + + Attributes: + name: The tool/function name (e.g. ``"get_weather"``). + arguments: Expected arguments. ``None`` means "don't check arguments" or "no arguments". + """ + + name: str + arguments: dict[str, Any] | None = None + + +def _split_last_turn(conversation: list[Message]) -> tuple[list[Message], list[Message]]: + """Split at the last user message (default strategy).""" + last_user_idx = -1 + for i, msg in enumerate(conversation): + if msg.role == "user": + last_user_idx = i + if last_user_idx >= 0: + return conversation[: last_user_idx + 1], conversation[last_user_idx + 1 :] + return [], list(conversation) + + +def _split_full(conversation: list[Message]) -> tuple[list[Message], list[Message]]: + """Split after the first user message (evaluates whole trajectory).""" + for i, msg in enumerate(conversation): + if msg.role == "user": + return conversation[: i + 1], conversation[i + 1 :] + return [], list(conversation) + + +_BUILT_IN_SPLITTERS: dict[ConversationSplit, Callable[[list[Message]], tuple[list[Message], list[Message]]]] = { + ConversationSplit.LAST_TURN: _split_last_turn, + ConversationSplit.FULL: _split_full, +} + + +class EvalItem: + """A single item to be evaluated. + + Represents one query/response interaction in a provider-agnostic format. + ``conversation`` is the single source of truth — ``query`` and ``response`` + are derived from it via the split strategy. + + Attributes: + conversation: Full conversation as ``Message`` objects. + tools: Typed tool objects (e.g. ``FunctionTool``) for evaluator logic. + context: Optional grounding context document. + expected_output: Optional expected output for ground-truth comparison. + expected_tool_calls: Expected tool calls for tool-correctness + evaluation. See :class:`ExpectedToolCall`. + split_strategy: Split strategy controlling how ``query`` and + ``response`` are derived from the conversation. Defaults to + ``ConversationSplit.LAST_TURN``. 
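+
+    Example (illustrative; the messages and expected tool call below are
+    placeholder data, not output from a real agent):
+
+    .. code-block:: python
+
+        item = EvalItem(
+            conversation=[
+                Message("user", ["What's the weather in Paris?"]),
+                Message("assistant", ["It's sunny and 18 degrees in Paris."]),
+            ],
+            expected_tool_calls=[ExpectedToolCall("get_weather", {"location": "Paris"})],
+        )
+        item.query  # "What's the weather in Paris?"
+        item.response  # "It's sunny and 18 degrees in Paris."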
+ """ + + def __init__( + self, + conversation: list[Message], + tools: list[FunctionTool] | None = None, + context: str | None = None, + expected_output: str | None = None, + expected_tool_calls: list[ExpectedToolCall] | None = None, + split_strategy: ConversationSplitter | None = None, + ) -> None: + self.conversation = conversation + self.tools = tools + self.context = context + self.expected_output = expected_output + self.expected_tool_calls = expected_tool_calls + self.split_strategy = split_strategy + + @property + def query(self) -> str: + """User query text, derived from the query side of the conversation split.""" + query_msgs, _ = self._split_conversation(self.split_strategy or ConversationSplit.LAST_TURN) + user_texts = [m.text for m in query_msgs if m.role == "user" and m.text] + return " ".join(user_texts).strip() + + @property + def response(self) -> str: + """Agent response text, derived from the response side of the conversation split.""" + _, response_msgs = self._split_conversation(self.split_strategy or ConversationSplit.LAST_TURN) + assistant_texts = [m.text for m in response_msgs if m.role == "assistant" and m.text] + return " ".join(assistant_texts).strip() + + def _split_conversation(self, split: ConversationSplitter) -> tuple[list[Message], list[Message]]: + """Split ``self.conversation`` into (query_messages, response_messages).""" + if callable(split) and not isinstance(split, ConversationSplit): + return split(self.conversation) + return _BUILT_IN_SPLITTERS[split](self.conversation) + + def split_messages( + self, + split: ConversationSplitter | None = None, + ) -> tuple[list[Message], list[Message]]: + """Split the conversation into (query_messages, response_messages). + + Resolution order: explicit *split*, then ``self.split_strategy``, + then ``ConversationSplit.LAST_TURN``. + """ + effective = split or self.split_strategy or ConversationSplit.LAST_TURN + return self._split_conversation(effective) + + @staticmethod + def _split_last_turn_static( + conversation: list[Message], + ) -> tuple[list[Message], list[Message]]: + """Split at the last user message. Usable as a fallback in custom splitters.""" + return _split_last_turn(conversation) + + @staticmethod + def per_turn_items( + conversation: list[Message], + *, + tools: list[FunctionTool] | None = None, + context: str | None = None, + ) -> list[EvalItem]: + """Split a multi-turn conversation into one ``EvalItem`` per turn. + + Each user message starts a new turn. The resulting ``EvalItem`` + has cumulative context: ``query_messages`` contains the full + conversation up to and including that user message, and + ``response_messages`` contains the agent's actions up to the next + user message. This lets you evaluate each response independently + with its full preceding context. + + Args: + conversation: Full conversation as ``Message`` objects. + tools: Tool objects shared across all items. + context: Optional grounding context shared across all items. + + Returns: + A list of ``EvalItem`` instances, one per user turn. + """ + user_indices = [i for i, m in enumerate(conversation) if m.role == "user"] + if not user_indices: + return [] + + items: list[EvalItem] = [] + for turn_idx, _ui in enumerate(user_indices): + # Response runs from after the user message to the next user + # message (or end of conversation). 
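+            # The resulting item's conversation spans everything up to (but not
+            # including) the next user message, so the default LAST_TURN split
+            # treats this turn's user message as the query and the agent's
+            # subsequent actions as the response.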
+ next_ui = user_indices[turn_idx + 1] if turn_idx + 1 < len(user_indices) else len(conversation) + + items.append( + EvalItem( + conversation=conversation[:next_ui], + tools=tools, + context=context, + ) + ) + + return items + + +# endregion + +# region Score and result types + + +@dataclass +class EvalScoreResult: + """Result from a single evaluator on a single item. + + Attributes: + name: Evaluator name (e.g. ``"relevance"``). + score: Numeric score from the evaluator. + passed: Whether the item passed this evaluator's threshold. + sample: Optional raw evaluator output (rationale, metadata). + """ + + name: str + score: float + passed: bool | None = None + sample: dict[str, Any] | None = None + + +@dataclass +class EvalItemResult: + """Per-item result from an evaluation run. + + Attributes: + item_id: Provider-assigned item identifier. + status: ``"pass"``, ``"fail"``, or ``"error"``. + scores: Per-evaluator results for this item. + error_code: Error category when ``status == "error"`` + (e.g. ``"QueryExtractionError"``). + error_message: Human-readable error detail. + response_id: Responses API response ID, if applicable. + input_text: The query/input that was evaluated. + output_text: The response/output that was evaluated. + token_usage: Token counts (``prompt_tokens``, + ``completion_tokens``, ``total_tokens``). + metadata: Additional provider-specific data. + """ + + item_id: str + status: str + scores: list[EvalScoreResult] = field(default_factory=lambda: list[EvalScoreResult]()) + error_code: str | None = None + error_message: str | None = None + response_id: str | None = None + input_text: str | None = None + output_text: str | None = None + token_usage: dict[str, int] | None = None + metadata: dict[str, Any] | None = None + + @property + def is_error(self) -> bool: + """Whether this item errored (infrastructure failure, not quality).""" + return self.status in ("error", "errored") + + @property + def is_passed(self) -> bool: + """Whether this item passed all evaluators.""" + return self.status == "pass" + + @property + def is_failed(self) -> bool: + """Whether this item failed at least one evaluator.""" + return self.status == "fail" + + +class EvalResults: + """Results from an evaluation run by a single provider. + + Attributes: + provider: Name of the evaluation provider that produced these results. + eval_id: The evaluation definition ID (provider-specific). + run_id: The evaluation run ID (provider-specific). + status: Run status - ``"completed"``, ``"failed"``, ``"canceled"``, + or ``"timeout"`` if polling exceeded the deadline. + result_counts: Pass/fail counts, populated when completed. + report_url: URL to view results in the provider's portal. + error: Error details when the run failed. + per_evaluator: Per-evaluator result counts, keyed by evaluator name. + items: Per-item results with individual pass/fail/error status, + evaluator scores, error details, and token usage. Populated + when the provider supports per-item retrieval (e.g. Foundry + ``output_items`` API). + sub_results: Per-agent breakdown for workflow evaluations, keyed by + agent/executor name. + + Example: + + .. 
code-block:: python + + results = await evaluate_agent(agent=my_agent, queries=["Hello"], evaluators=evals) + for r in results: + print(f"{r.provider}: {r.passed}/{r.total}") + + # Per-item detail + for item in r.items: + print(f" {item.item_id}: {item.status}") + for score in item.scores: + print(f" {score.name}: {score.score} ({'pass' if score.passed else 'fail'})") + if item.is_error: + print(f" Error: {item.error_code} - {item.error_message}") + + # Workflow eval - per-agent breakdown + for r in results: + for name, sub in r.sub_results.items(): + print(f" {name}: {sub.passed}/{sub.total}") + """ + + def __init__( + self, + *, + provider: str, + eval_id: str = "", + run_id: str = "", + status: str = "completed", + result_counts: dict[str, int] | None = None, + report_url: str | None = None, + error: str | None = None, + per_evaluator: dict[str, dict[str, int]] | None = None, + items: list[EvalItemResult] | None = None, + sub_results: dict[str, EvalResults] | None = None, + ) -> None: + self.provider = provider + self.eval_id = eval_id + self.run_id = run_id + self.status = status + self.result_counts = result_counts + self.report_url = report_url + self.error = error + self.per_evaluator = per_evaluator or {} + self.items = items or [] + self.sub_results = sub_results or {} + + @property + def passed(self) -> int: + """Number of passing results.""" + return (self.result_counts or {}).get("passed", 0) + + @property + def failed(self) -> int: + """Number of failing results.""" + return (self.result_counts or {}).get("failed", 0) + + @property + def total(self) -> int: + """Total number of results (passed + failed).""" + return self.passed + self.failed + + @property + def all_passed(self) -> bool: + """Whether all results passed with no failures or errors. + + For workflow evals with sub-agents, checks that all sub-results passed. + Returns ``False`` if the run did not complete successfully. + """ + if self.status not in ("completed",): + return False + errored = (self.result_counts or {}).get("errored", 0) + own_passed = self.failed == 0 and errored == 0 and self.total > 0 if self.result_counts else True + if self.sub_results: + return own_passed and all(sub.all_passed for sub in self.sub_results.values()) + return self.failed == 0 and errored == 0 and self.total > 0 + + def raise_for_status(self, msg: str | None = None) -> None: + """Raise ``EvalNotPassedError`` if any results failed or errored. + + Similar to ``requests.Response.raise_for_status()`` — call after + evaluation to verify quality in CI pipelines or test suites. + + Args: + msg: Optional custom failure message. + + Raises: + EvalNotPassedError: When any results failed or errored. + """ + if not self.all_passed: + errored = (self.result_counts or {}).get("errored", 0) + detail = msg or ( + f"Eval run {self.run_id} {self.status}: " + f"{self.passed} passed, {self.failed} failed." + ) + if errored: + detail += f" {errored} errored." + if self.report_url: + detail += f" See {self.report_url} for details." + if self.error: + detail += f" Error: {self.error}" + if self.sub_results: + failed = [name for name, sub in self.sub_results.items() if not sub.all_passed] + if failed: + detail += f" Failed: {', '.join(failed)}." + if self.items: + errored_items = [i for i in self.items if i.is_error] + if errored_items: + summaries = [f"{i.item_id}: {i.error_code or 'unknown'}" for i in errored_items] + detail += f" Errored items: {', '.join(summaries)}." 
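+            # Aggregate every failure signal (failed/errored counts, per-agent
+            # sub-results, errored items) into a single exception message.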
+ raise EvalNotPassedError(detail) + + +# endregion + +# region Evaluator protocol + + +@runtime_checkable +class Evaluator(Protocol): + """Protocol for evaluation providers. + + Any evaluation backend (Azure AI Foundry, local LLM-as-judge, custom + scorers, etc.) implements this protocol. The provider encapsulates all + connection details, evaluator selection, and execution logic. + + Example implementation: + + .. code-block:: python + + class MyEvaluator: + def __init__(self, name: str = "my-evaluator"): + self.name = name + + async def evaluate(self, items: Sequence[EvalItem], *, eval_name: str = "Eval") -> EvalResults: + # Score each item and return results + ... + """ + + name: str + + async def evaluate( + self, + items: Sequence[EvalItem], + *, + eval_name: str, + ) -> EvalResults: + """Evaluate a batch of items and return results. + + The evaluator determines which metrics to run. It may auto-detect + capabilities from the items (e.g., run tool evaluators only when + ``tools`` is present). + + Args: + items: Eval data items to score. + eval_name: Display name for the evaluation run. + + Returns: + ``EvalResults`` with status, counts, and optional portal link. + """ + ... + + +# endregion + +# region Converter + + +class AgentEvalConverter: + """Converts agent-framework types to evaluation format. + + Handles the type gap between agent-framework's ``Message`` / ``Content`` / + ``FunctionTool`` types and the OpenAI-style agent message schema used by + evaluation providers. All methods are static — no instantiation needed. + """ + + @staticmethod + def convert_message(message: Message) -> list[dict[str, Any]]: + """Convert a single ``Message`` to Foundry agent evaluator format. + + Uses typed content lists as required by Foundry evaluators: + + .. code-block:: python + + {"role": "assistant", "content": [{"type": "tool_call", ...}]} + + A single agent-framework ``Message`` with multiple ``function_result`` + contents produces multiple output messages (one per tool result). + + Args: + message: An agent-framework ``Message``. + + Returns: + A list of Foundry-format message dicts. + """ + role = message.role + contents = message.contents or [] + + content_items: list[dict[str, Any]] = [] + tool_results: list[dict[str, Any]] = [] + + for c in contents: + if c.type == "text" and c.text: + content_items.append({"type": "text", "text": c.text}) + elif c.type == "function_call": + args = c.arguments + if isinstance(args, str): + try: + args = json.loads(args) + except (json.JSONDecodeError, TypeError): + # Sanitize to avoid leaking sensitive tool-call arguments + # to external evaluation services. 
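+                        # The call's name and id are still forwarded below; only
+                        # the unparseable argument string is replaced.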
+ args = {"_raw_arguments": "[unparseable]"} + tc: dict[str, Any] = { + "type": "tool_call", + "tool_call_id": c.call_id or "", + "name": c.name or "", + } + if args: + tc["arguments"] = args + content_items.append(tc) + elif c.type == "function_result": + result_val = c.result + if isinstance(result_val, str): + with contextlib.suppress(json.JSONDecodeError, TypeError): + result_val = json.loads(result_val) + tool_results.append({ + "call_id": c.call_id or "", + "result": result_val, + }) + + output: list[dict[str, Any]] = [] + + if tool_results: + for tr in tool_results: + output.append({ + "role": "tool", + "tool_call_id": tr["call_id"], + "content": [{"type": "tool_result", "tool_result": tr["result"]}], + }) + elif content_items: + output.append({"role": role, "content": content_items}) + else: + output.append({ + "role": role, + "content": [{"type": "text", "text": ""}], + }) + + return output + + @staticmethod + def convert_messages(messages: Sequence[Message]) -> list[dict[str, Any]]: + """Convert a sequence of ``Message`` objects to Foundry evaluator format. + + Args: + messages: Agent-framework messages. + + Returns: + A list of Foundry-format message dicts with typed content lists. + """ + result: list[dict[str, Any]] = [] + for msg in messages: + result.extend(AgentEvalConverter.convert_message(msg)) + return result + + @staticmethod + def extract_tools(agent: Any) -> list[dict[str, Any]]: + """Extract tool definitions from an agent instance. + + Reads ``agent.default_options["tools"]`` and ``agent.mcp_tools`` + and converts each ``FunctionTool`` to ``{name, description, parameters}``. + + Args: + agent: An agent-framework agent instance. + + Returns: + A list of tool definition dicts. + """ + tools: list[dict[str, Any]] = [] + seen: set[str] = set() + raw_tools = getattr(agent, "default_options", {}).get("tools", []) + for t in raw_tools: + if isinstance(t, FunctionTool) and t.name not in seen: + tools.append({ + "name": t.name, + "description": t.description, + "parameters": t.parameters(), + }) + seen.add(t.name) + # Include tools from connected MCP servers + for mcp in getattr(agent, "mcp_tools", []): + for t in getattr(mcp, "functions", []): + if isinstance(t, FunctionTool) and t.name not in seen: + tools.append({ + "name": t.name, + "description": t.description, + "parameters": t.parameters(), + }) + seen.add(t.name) + return tools + + @staticmethod + def to_eval_item( + *, + query: str | Sequence[Message], + response: AgentResponse[Any], + agent: Any | None = None, + tools: Sequence[FunctionTool] | None = None, + context: str | None = None, + ) -> EvalItem: + """Convert a complete agent interaction to an ``EvalItem``. + + Args: + query: The user query string, or input messages. + response: The agent's response. + agent: Optional agent instance to auto-extract tool definitions. + tools: Explicit tool list (takes precedence over *agent*). + context: Optional context document for groundedness evaluation. + + Returns: + An ``EvalItem`` suitable for passing to any ``Evaluator``. 
+ """ + input_msgs = [Message("user", [query])] if isinstance(query, str) else list(query) + + all_msgs = list(input_msgs) + list(response.messages or []) + + typed_tools: list[FunctionTool] = [] + if tools: + typed_tools = list(tools) + elif agent: + raw_tools = getattr(agent, "default_options", {}).get("tools", []) + typed_tools = [t for t in raw_tools if isinstance(t, FunctionTool)] + # Include tools from connected MCP servers + seen = {t.name for t in typed_tools} + for mcp in getattr(agent, "mcp_tools", []): + for t in getattr(mcp, "functions", []): + if isinstance(t, FunctionTool) and t.name not in seen: + typed_tools.append(t) + seen.add(t.name) + + return EvalItem( + conversation=all_msgs, + tools=typed_tools or None, + context=context, + ) + + +# endregion + +# region Workflow extraction helpers + + +class _AgentEvalData(TypedDict): + executor_id: str + query: str | Sequence[Message] + response: AgentResponse[Any] + agent: Any | None + + +def _extract_agent_eval_data( + workflow_result: WorkflowRunResult, + workflow: Workflow | None = None, +) -> list[_AgentEvalData]: + """Walk a WorkflowRunResult and extract per-agent query/response pairs. + + Pairs ``executor_invoked`` with ``executor_completed`` events for each + ``AgentExecutor``. Skips internal framework executors. + """ + from ._workflows._agent_executor import AgentExecutor as AE + from ._workflows._agent_executor import AgentExecutorResponse + + invoked_data: dict[str, Any] = {} + results: list[_AgentEvalData] = [] + + for event in workflow_result: + if event.type == "executor_invoked" and event.executor_id: + invoked_data[event.executor_id] = event.data + + elif event.type == "executor_completed" and event.executor_id: + executor_id = event.executor_id + + # Skip internal framework executors + if executor_id.startswith("_") or executor_id.lower() in {"input-conversation", "end-conversation", "end"}: + logger.debug("Skipping internal executor %r during eval data extraction", executor_id) + continue + + completion_data: Any = event.data + agent_exec_response: AgentExecutorResponse | None = None + + if isinstance(completion_data, list): + for cdata_item in cast(list[Any], completion_data): # type: ignore[redundant-cast] + if isinstance(cdata_item, AgentExecutorResponse): + agent_exec_response = cdata_item + break + elif isinstance(completion_data, AgentExecutorResponse): + agent_exec_response = completion_data + + if agent_exec_response is None: + continue + + query: str | list[Message] + if agent_exec_response.full_conversation: + user_msgs = [m for m in agent_exec_response.full_conversation if m.role == "user"] + query = user_msgs or agent_exec_response.full_conversation # type: ignore[assignment] + elif executor_id in invoked_data: + input_data: Any = invoked_data[executor_id] + query = ( # type: ignore[assignment] + input_data if isinstance(input_data, (str, list)) else str(input_data) + ) + else: + continue + + agent_ref = None + if workflow is not None: + executor = workflow.executors.get(executor_id) + if executor is not None and isinstance(executor, AE): + agent_ref = executor.agent + + results.append( + _AgentEvalData( + executor_id=executor_id, + query=query, + response=agent_exec_response.agent_response, + agent=agent_ref, + ) + ) + + return results + + +def _extract_overall_query(workflow_result: WorkflowRunResult) -> str | None: + """Extract the original user query from a workflow result.""" + for event in workflow_result: + if event.type == "executor_invoked" and event.data is not None: + data: Any = event.data 
+ if isinstance(data, str): + return data + if isinstance(data, list) and data: + items_list = cast(list[Any], data) # type: ignore[redundant-cast] + first = items_list[0] + if isinstance(first, Message): + msgs: list[Message] = [m for m in items_list if isinstance(m, Message)] + return " ".join(str(m.text) for m in msgs if hasattr(m, "text") and m.role == "user") + if isinstance(first, str): + return " ".join(str(s) for s in items_list) + return str(data) # type: ignore[reportUnknownArgumentType] + return None + + +# endregion + +# region Local evaluation checks + + +@dataclass +class CheckResult: + """Result of a single check on a single evaluation item. + + Attributes: + passed: Whether the check passed. + reason: Human-readable explanation. + check_name: Name of the check that produced this result. + """ + + passed: bool + reason: str + check_name: str + + +EvalCheck = Callable[[EvalItem], CheckResult | Awaitable[CheckResult]] +"""A check function that takes an ``EvalItem`` and returns a ``CheckResult``. + +Both sync and async functions are supported. Async checks should return +an awaitable ``CheckResult``; they will be awaited automatically by +``LocalEvaluator``. +""" + + +def keyword_check(*keywords: str, case_sensitive: bool = False) -> EvalCheck: + """Check that the response contains all specified keywords. + + Args: + *keywords: Required keywords that must appear in the response. + case_sensitive: Whether matching is case-sensitive (default ``False``). + + Returns: + A check function for use with ``LocalEvaluator``. + + Example: + + .. code-block:: python + + check = keyword_check("weather", "temperature") + """ + + def _check(item: EvalItem) -> CheckResult: + text = item.response if case_sensitive else item.response.lower() + missing = [k for k in keywords if (k if case_sensitive else k.lower()) not in text] + if missing: + return CheckResult(passed=False, reason=f"Missing keywords: {missing}", check_name="keyword_check") + return CheckResult(passed=True, reason="All keywords found", check_name="keyword_check") + + return _check + + +def tool_called_check(*tool_names: str, mode: Literal["all", "any"] = "all") -> EvalCheck: + """Check that specific tools were called during the conversation. + + Inspects the conversation history for ``tool_calls`` entries matching + the expected tool names. + + Args: + *tool_names: Names of tools that should have been called. + mode: ``"all"`` requires every tool to be called; ``"any"`` requires + at least one. Defaults to ``"all"``. + + Returns: + A check function for use with ``LocalEvaluator``. + + Example: + + .. 
code-block:: python + + check = tool_called_check("get_weather", "get_flight_price") + """ + + def _check(item: EvalItem) -> CheckResult: + expected = set(tool_names) + called: set[str] = set() + for msg in item.conversation: + for c in msg.contents or []: + if c.type == "function_call" and c.name: + called.add(c.name) + if mode == "all" and expected.issubset(called): + return CheckResult( + passed=True, + reason=f"All expected tools called: {sorted(called)}", + check_name="tool_called", + ) + if mode == "any" and expected & called: + return CheckResult( + passed=True, + reason=f"Expected tool found: {sorted(expected & called)}", + check_name="tool_called", + ) + if mode == "all": + missing = [t for t in tool_names if t not in called] + if missing: + return CheckResult( + passed=False, + reason=f"Expected tools not called: {missing} (called: {sorted(called)})", + check_name="tool_called", + ) + return CheckResult( + passed=True, + reason=f"All expected tools called: {sorted(called)}", + check_name="tool_called", + ) + return CheckResult( + passed=False, + reason=f"None of expected tools called: {list(tool_names)} (called: {sorted(called)})", + check_name="tool_called", + ) + + return _check + + +def _extract_tool_calls(item: EvalItem) -> list[tuple[str, dict[str, Any] | None]]: + """Extract (name, arguments) pairs from the conversation's function calls.""" + calls: list[tuple[str, dict[str, Any] | None]] = [] + for msg in item.conversation: + for c in msg.contents or []: + if c.type == "function_call" and c.name: + args: dict[str, Any] | None = None + if isinstance(c.arguments, dict): + args = c.arguments + elif isinstance(c.arguments, str): + try: + parsed = json.loads(c.arguments) + if isinstance(parsed, dict): + args = cast(dict[str, Any], parsed) + except (json.JSONDecodeError, TypeError): + pass + calls.append((c.name, args)) + return calls + + +def tool_calls_present(item: EvalItem) -> CheckResult: + """Check that all expected tool calls were made (unordered, extras OK). + + Uses ``item.expected_tool_calls`` — checks that every expected tool name + appears at least once in the conversation. Does not check arguments or + ordering. Extra (unexpected) tool calls are not penalized. + + Example: + + .. code-block:: python + + local = LocalEvaluator(tool_calls_present) + results = await evaluate_agent( + agent=agent, + queries=["What's the weather?"], + expected_tool_calls=[[ExpectedToolCall("get_weather")]], + evaluators=local, + ) + """ + expected = item.expected_tool_calls or [] + if not expected: + return CheckResult(passed=True, reason="No expected tool calls specified.", check_name="tool_calls_present") + + actual_names = {name for name, _ in _extract_tool_calls(item)} + expected_names = [e.name for e in expected] + found = [n for n in expected_names if n in actual_names] + missing = [n for n in expected_names if n not in actual_names] + + if missing: + return CheckResult( + passed=False, + reason=f"Missing tool calls: {missing} (called: {sorted(actual_names)})", + check_name="tool_calls_present", + ) + return CheckResult( + passed=True, + reason=f"All expected tools called: {found} (called: {sorted(actual_names)})", + check_name="tool_calls_present", + ) + + +def tool_call_args_match(item: EvalItem) -> CheckResult: + """Check that expected tool calls match on name and arguments. + + For each expected tool call, finds matching calls in the conversation + by name. 
If ``ExpectedToolCall.arguments`` is provided, checks that + the actual arguments contain all expected key-value pairs (subset + match — extra actual arguments are OK). + + Example: + + .. code-block:: python + + local = LocalEvaluator(tool_call_args_match) + results = await evaluate_agent( + agent=agent, + queries=["What's the weather in NYC?"], + expected_tool_calls=[ + [ExpectedToolCall("get_weather", {"location": "NYC"})], + ], + evaluators=local, + ) + """ + expected = item.expected_tool_calls or [] + if not expected: + return CheckResult(passed=True, reason="No expected tool calls specified.", check_name="tool_call_args_match") + + actual_calls = _extract_tool_calls(item) + matched = 0 + details: list[str] = [] + + for exp in expected: + matching = [(n, a) for n, a in actual_calls if n == exp.name] + if not matching: + details.append(f" {exp.name}: not called") + continue + + if exp.arguments is None: + matched += 1 + details.append(f" {exp.name}: called (args not checked)") + continue + + # Subset match — all expected keys present with expected values + found = False + for _, actual_args in matching: + if actual_args is None: + continue + if all(actual_args.get(k) == v for k, v in exp.arguments.items()): + found = True + break + + if found: + matched += 1 + details.append(f" {exp.name}: args match") + else: + actual_args_list = [a for _, a in matching] + details.append(f" {exp.name}: args mismatch (actual: {actual_args_list})") + + passed = matched == len(expected) + score_str = f"{matched}/{len(expected)}" + detail_str = "\n".join(details) + reason = f"Tool call args match: {score_str}\n{detail_str}" + + return CheckResult(passed=passed, reason=reason, check_name="tool_call_args_match") + + +# endregion + +# region Function evaluator — wrap plain functions as EvalChecks + +# Parameters recognized by the function evaluator wrapper +_KNOWN_PARAMS = frozenset({ + "query", + "response", + "expected_output", + "expected_tool_calls", + "conversation", + "tools", + "context", +}) + + +def _resolve_function_args( + fn: Callable[..., Any], + item: EvalItem, + *, + _param_names: frozenset[str] | set[str] | None = None, +) -> dict[str, Any]: + """Build a kwargs dict for *fn* based on its signature and the EvalItem. + + Supported parameter names: + + ====================== ==================================================== + Name Value from EvalItem + ====================== ==================================================== + query ``item.query`` + response ``item.response`` + expected_output ``item.expected_output`` (empty string if not set) + expected_tool_calls ``item.expected_tool_calls`` (empty list if not set) + conversation ``item.conversation`` (list[Message]) + tools ``item.tools`` (typed ``FunctionTool`` objects) + context ``item.context`` + ====================== ==================================================== + + Parameters with default values are only supplied when their name is + recognised. Unknown required parameters raise ``TypeError``. + + When called from the ``@evaluator`` wrapper the pre-computed + *_param_names* set avoids repeated ``inspect.signature`` calls. 
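+
+    Example (illustrative; ``my_check`` and ``item`` are placeholders):
+
+    .. code-block:: python
+
+        def my_check(query: str, response: str) -> bool: ...
+
+
+        _resolve_function_args(my_check, item)
+        # -> {"query": item.query, "response": item.response}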
+ """ + field_map: dict[str, Any] = { + "query": item.query, + "response": item.response, + "expected_output": item.expected_output or "", + "expected_tool_calls": item.expected_tool_calls or [], + "conversation": item.conversation, + "tools": item.tools, + "context": item.context, + } + + if _param_names is not None: + return {k: field_map[k] for k in _param_names if k in field_map} + + # Fallback: introspect at call time (for direct callers) + sig = inspect.signature(fn) + kwargs: dict[str, Any] = {} + + for name, param in sig.parameters.items(): + if name in field_map: + kwargs[name] = field_map[name] + elif param.default is inspect.Parameter.empty: + raise TypeError( + f"Function evaluator '{fn.__name__}' has unknown required parameter " + f"'{name}'. Supported: {sorted(_KNOWN_PARAMS)}" + ) + # else: has a default — leave it to Python + + return kwargs + + +def _coerce_result(value: Any, check_name: str) -> CheckResult: + """Convert a function evaluator return value to a ``CheckResult``. + + Accepted return types: + + * ``bool`` — True/False maps directly to pass/fail. + * ``int | float`` — ≥ 0.5 is pass (score is included in reason). + * ``CheckResult`` — returned as-is. + * ``dict`` with ``score`` or ``passed`` key — converted to CheckResult. + """ + if isinstance(value, CheckResult): + return value + + if isinstance(value, bool): + return CheckResult(passed=value, reason="passed" if value else "failed", check_name=check_name) + + if isinstance(value, (int, float)): + passed = value >= 0.5 + return CheckResult(passed=passed, reason=f"score={value:.3f}", check_name=check_name) + + if isinstance(value, dict): + d = cast(dict[str, Any], value) + if "score" in d: + try: + score = float(d["score"]) + except (TypeError, ValueError) as exc: + raise TypeError( + f"Function evaluator '{check_name}' returned dict with non-numeric 'score' value: {d['score']!r}" + ) from exc + passed = score >= float(d.get("threshold", 0.5)) + reason = str(d.get("reason", f"score={score:.3f}")) + return CheckResult(passed=passed, reason=reason, check_name=check_name) + if "passed" in d: + passed_val = d["passed"] + if not isinstance(passed_val, (bool, int)): + raise TypeError( + f"Function evaluator '{check_name}' returned dict with non-boolean 'passed' value: {passed_val!r}" + ) + return CheckResult( + passed=bool(passed_val), + reason=str(d.get("reason", "passed" if passed_val else "failed")), + check_name=check_name, + ) + + value_type_name = type(value).__name__ # type: ignore[reportUnknownMemberType] + msg = ( + f"Function evaluator '{check_name}' returned unsupported type " + f"{value_type_name}. Expected bool, float, dict, or CheckResult." + ) + raise TypeError(msg) + + +@overload +def evaluator(fn: Callable[..., Any], /) -> EvalCheck: ... + + +@overload +def evaluator(*, name: str | None = None) -> Callable[[Callable[..., Any]], EvalCheck]: ... + + +def evaluator( + fn: Callable[..., Any] | None = None, + *, + name: str | None = None, +) -> EvalCheck | Callable[[Callable[..., Any]], EvalCheck]: + """Wrap a plain function as an ``EvalCheck`` for use with ``LocalEvaluator``. + + Works with both sync and async functions. The function's parameter names + determine what data it receives from the ``EvalItem``. 
Any combination of + the following parameter names is valid: + + * ``query`` — the user query (str) + * ``response`` — the agent response (str) + * ``expected_output`` — expected output for ground-truth comparison (str) + * ``conversation`` — full conversation history (list[Message]) + * ``tools`` — typed tool objects (list[FunctionTool]) + * ``context`` — grounding context (str | None) + + Return ``bool``, ``float`` (≥0.5 = pass), ``dict`` with ``score`` or + ``passed`` key, or ``CheckResult``. + + Can be used as a decorator (with or without arguments) or called directly: + + .. code-block:: python + + # Decorator — no args + @evaluator + def mentions_weather(query: str, response: str) -> bool: + return "weather" in response.lower() + + + # Decorator — with name + @evaluator(name="length_check") + def is_not_too_long(response: str) -> bool: + return len(response) < 2000 + + + # Direct wrapping + check = evaluator(my_scorer, name="my_scorer") + + + # Async function — handled automatically + @evaluator + async def llm_judge(query: str, response: str) -> float: + result = await my_llm_client.score(query, response) + return result.score + + + # Use with LocalEvaluator + local = LocalEvaluator(mentions_weather, is_not_too_long, check, llm_judge) + + Args: + fn: The function to wrap. If omitted, returns a decorator. + name: Display name for the check (defaults to ``fn.__name__``). + """ + + def _wrap(func: Callable[..., Any]) -> EvalCheck: + check_name: str = name or getattr(func, "__name__", None) or "evaluator" + # Cache signature introspection once per wrapped function + sig = inspect.signature(func) + param_names = { + n for n, p in sig.parameters.items() if n in _KNOWN_PARAMS or p.default is inspect.Parameter.empty + } + required_unknown = { + n for n, p in sig.parameters.items() if n not in _KNOWN_PARAMS and p.default is inspect.Parameter.empty + } + if required_unknown: + raise TypeError( + f"Function evaluator '{func.__name__}' has unknown required parameter(s) " + f"{sorted(required_unknown)}. Supported: {sorted(_KNOWN_PARAMS)}" + ) + + async def _check(item: EvalItem) -> CheckResult: + kwargs = _resolve_function_args(func, item, _param_names=param_names) + result = func(**kwargs) + if inspect.isawaitable(result): + result = await result + return _coerce_result(value=result, check_name=check_name) + + _check.__name__ = check_name # type: ignore[attr-defined,assignment] + _check.__doc__ = func.__doc__ + return _check + + # Support @evaluator (no parens) and @evaluator(name="x") + if fn is not None: + return _wrap(fn) + return _wrap + + +# endregion + +# region LocalEvaluator + + +async def _run_check(check_fn: EvalCheck, item: EvalItem) -> CheckResult: + """Run a single check, awaiting the result if it is a coroutine.""" + result = check_fn(item) + if inspect.isawaitable(result): + result = await result + return result + + +class LocalEvaluator: + """Evaluation provider that runs checks locally without API calls. + + Implements the ``Evaluator`` protocol. Each check function is applied + to every item. An item passes only if all checks pass. + + Examples: + Basic usage: + + .. code-block:: python + + from agent_framework import LocalEvaluator, keyword_check, evaluate_agent + + local = LocalEvaluator( + keyword_check("weather"), + tool_called_check("get_weather"), + ) + results = await evaluate_agent(agent=agent, queries=queries, evaluators=local) + + Mixing with cloud evaluators: + + .. 
code-block:: python + + from agent_framework.foundry import FoundryEvals + + results = await evaluate_agent( + agent=agent, + queries=queries, + evaluators=[local, FoundryEvals(project_client=client, model="gpt-4o")], + ) + """ + + def __init__(self, *checks: EvalCheck): + self.name = "Local" + self._checks = checks + + async def evaluate( + self, + items: Sequence[EvalItem], + *, + eval_name: str = "Local Eval", + ) -> EvalResults: + """Run all checks on each item and return aggregated results. + + An item passes only if every check passes for that item. Per-check + breakdowns are available in ``per_evaluator``. + + Supports both sync and async check functions (from + :func:`evaluator`). + """ + passed = 0 + failed = 0 + per_check: dict[str, dict[str, int]] = {} + failure_reasons: list[str] = [] + result_items: list[EvalItemResult] = [] + + for item_idx, item in enumerate(items): + check_results = await asyncio.gather(*[_run_check(fn, item) for fn in self._checks]) + item_passed = True + item_scores: list[EvalScoreResult] = [] + for result in check_results: + counts = per_check.setdefault(result.check_name, {"passed": 0, "failed": 0, "errored": 0}) + if result.passed: + counts["passed"] += 1 + else: + counts["failed"] += 1 + item_passed = False + failure_reasons.append(f"{result.check_name}: {result.reason}") + item_scores.append( + EvalScoreResult( + name=result.check_name, + score=1.0 if result.passed else 0.0, + passed=result.passed, + sample={"reason": result.reason} if result.reason else None, + ) + ) + + if item_passed: + passed += 1 + else: + failed += 1 + + result_items.append( + EvalItemResult( + item_id=str(item_idx), + status="pass" if item_passed else "fail", + scores=item_scores, + input_text=item.query, + output_text=item.response, + ) + ) + + return EvalResults( + provider=self.name, + eval_id="local", + run_id=eval_name, + status="completed", + result_counts={"passed": passed, "failed": failed, "errored": 0}, + per_evaluator=per_check, + items=result_items, + error="; ".join(failure_reasons) if failure_reasons else None, + ) + + +# endregion + +# region Public orchestration functions + + +async def evaluate_agent( + *, + agent: SupportsAgentRun | None = None, + queries: str | Sequence[str] | None = None, + expected_output: str | Sequence[str] | None = None, + expected_tool_calls: Sequence[ExpectedToolCall] | Sequence[Sequence[ExpectedToolCall]] | None = None, + responses: AgentResponse[Any] | Sequence[AgentResponse[Any]] | None = None, + evaluators: Evaluator | Callable[..., Any] | Sequence[Evaluator | Callable[..., Any]], + eval_name: str | None = None, + context: str | None = None, + conversation_split: ConversationSplitter | None = None, + num_repetitions: int = 1, +) -> list[EvalResults]: + """Run an agent against test queries and evaluate the results. + + The simplest path for evaluating an agent during development. For each + query, runs the agent, converts the interaction to eval format, and + submits to the evaluator(s). + + All sequence parameters (``queries``, ``expected_output``, + ``expected_tool_calls``, ``responses``) accept either a single value + or a list for convenience. + + If ``responses`` is provided, skips running the agent and evaluates those + responses directly — but still extracts tool definitions from the agent. + In this mode ``queries`` is required to construct the conversation. + + Args: + agent: An agent-framework agent instance. + queries: Test query or queries to run the agent against. A single + string is wrapped into a one-element list. 
Required when + ``responses`` is not provided. + expected_output: Ground-truth expected output(s), one per query. A + single string is wrapped into a one-element list. When provided, + must be the same length as ``queries``. Each value is stamped on + the corresponding ``EvalItem.expected_output`` for evaluators + that compare against a reference answer. + expected_tool_calls: Expected tool call(s), one list per query. A + single flat list of ``ExpectedToolCall`` is wrapped into a + one-element nested list. When provided, must be the same length + as ``queries``. + responses: Pre-existing ``AgentResponse``(s) to evaluate without + running the agent. A single response is wrapped into a one-element + list. When provided, ``queries`` must also be provided to + construct the conversation for evaluation. + evaluators: One or more ``Evaluator`` instances. + eval_name: Display name (defaults to agent name). + context: Optional context for groundedness evaluation. + conversation_split: Split strategy applied to all items, overriding + each evaluator's default. See ``ConversationSplitter``. + num_repetitions: Number of times to run each query (default 1). + When > 1, each query is invoked independently N times to measure + consistency. Results contain all N x len(queries) items. + Ignored when ``responses`` is provided (pre-existing responses + are evaluated as-is). + + Returns: + A list of ``EvalResults``, one per evaluator provider. + + Raises: + ValueError: If neither ``queries`` nor ``responses`` is provided. + + Examples: + Run and evaluate: + + .. code-block:: python + + results = await evaluate_agent( + agent=my_agent, + queries="What's the weather?", + evaluators=evals, + ) + + Evaluate existing responses: + + .. code-block:: python + + response = await agent.run([Message("user", ["What's the weather?"])]) + results = await evaluate_agent( + agent=agent, + responses=response, + queries="What's the weather?", + evaluators=evals, + ) + + With ground-truth expected answers: + + .. code-block:: python + + results = await evaluate_agent( + agent=my_agent, + queries=["What's 2+2?", "Capital of France?"], + expected_output=["4", "Paris"], + evaluators=evals, + ) + + With expected tool calls: + + .. 
code-block:: python + + results = await evaluate_agent( + agent=my_agent, + queries="What's the weather in NYC?", + expected_tool_calls=[ExpectedToolCall("get_weather", {"location": "NYC"})], + evaluators=evals, + ) + """ + # Normalize singular values to lists + if isinstance(queries, str): + queries = [queries] + if isinstance(expected_output, str): + expected_output = [expected_output] + if isinstance(responses, AgentResponse): + responses = [responses] + if ( + expected_tool_calls is not None + and len(expected_tool_calls) > 0 + and isinstance(expected_tool_calls[0], ExpectedToolCall) + ): + expected_tool_calls = [list(cast(Sequence[ExpectedToolCall], expected_tool_calls))] + + items: list[EvalItem] = [] + + # Validate num_repetitions + if num_repetitions < 1: + raise ValueError(f"num_repetitions must be >= 1, got {num_repetitions}.") + + # Validate expected_output length against queries + if expected_output is not None and queries is not None and len(expected_output) != len(queries): + raise ValueError(f"Got {len(queries)} queries but {len(expected_output)} expected_output values.") + + # Validate expected_tool_calls length against queries + if expected_tool_calls is not None and queries is not None and len(expected_tool_calls) != len(queries): + raise ValueError(f"Got {len(queries)} queries but {len(expected_tool_calls)} expected_tool_calls lists.") + + if responses is not None: + # Evaluate pre-existing responses (don't run the agent) + resp_list = list(responses) + + if queries is not None: + query_list = list(queries) + if len(query_list) != len(resp_list): + raise ValueError(f"Got {len(query_list)} queries but {len(resp_list)} responses.") + for q, r in zip(query_list, resp_list): + items.append( + AgentEvalConverter.to_eval_item( + query=q, + response=r, + agent=agent, + context=context, + ) + ) + else: + raise ValueError( + "Provide 'queries' alongside 'responses' so the conversation " + "can be constructed for evaluation. For Responses API " + "evaluation by response ID, use evaluate_traces(response_ids=...) from " + "the azure-ai package." + ) + elif queries is not None and agent is not None: + # Run the agent against test queries, with repetitions + for _rep in range(num_repetitions): + for query in queries: + response = await agent.run([Message("user", [query])]) + items.append( + AgentEvalConverter.to_eval_item( + query=query, + response=response, + agent=agent, + context=context, + ) + ) + elif queries is not None and agent is None: + raise ValueError( + "Provide 'agent' when using 'queries' to run the agent. " + "To evaluate pre-existing responses without an agent, use 'responses=' instead." 
+ ) + else: + raise ValueError("Provide either 'queries' (with 'agent') or 'responses' (or both).") + + # Stamp expected output values on items (repeated across all repetitions) + if expected_output is not None: + query_count = len(expected_output) + for i, item in enumerate(items): + item.expected_output = expected_output[i % query_count] + + # Stamp expected tool calls on items (repeated across all repetitions) + if expected_tool_calls is not None: + # After normalization, expected_tool_calls is Sequence[Sequence[ExpectedToolCall]] + tc_list = cast(Sequence[Sequence[ExpectedToolCall]], expected_tool_calls) + query_count = len(tc_list) + for i, item in enumerate(items): + item.expected_tool_calls = list(tc_list[i % query_count]) + + # Stamp split strategy on items so evaluators respect it + if conversation_split is not None: + for item in items: + item.split_strategy = conversation_split + + name = eval_name or f"Eval: {getattr(agent, 'name', None) or getattr(agent, 'id', 'agent') if agent else 'agent'}" + return await _run_evaluators(evaluators, items, eval_name=name) + + +async def evaluate_workflow( + *, + workflow: Workflow, + workflow_result: WorkflowRunResult | None = None, + queries: str | Sequence[str] | None = None, + evaluators: Evaluator | Callable[..., Any] | Sequence[Evaluator | Callable[..., Any]], + eval_name: str | None = None, + include_overall: bool = True, + include_per_agent: bool = True, + conversation_split: ConversationSplitter | None = None, + num_repetitions: int = 1, +) -> list[EvalResults]: + """Evaluate a multi-agent workflow with per-agent breakdown. + + Evaluates each sub-agent individually and (optionally) the workflow's + overall output. Returns one ``EvalResults`` per evaluator provider, each + with per-agent breakdowns in ``sub_results``. + + **Two modes:** + + - **Post-hoc**: Pass ``workflow_result`` from a previous + ``workflow.run()`` call. + - **Run + evaluate**: Pass ``queries`` and the workflow will be run + against each query, then evaluated. + + Args: + workflow: The workflow instance. + workflow_result: A completed ``WorkflowRunResult``. + queries: Test queries to run through the workflow. + evaluators: One or more ``Evaluator`` instances. + eval_name: Display name for the evaluation. + include_overall: Whether to evaluate the workflow's final output. + include_per_agent: Whether to evaluate each sub-agent individually. + conversation_split: Split strategy applied to all items, overriding + each evaluator's default. See ``ConversationSplitter``. + num_repetitions: Number of times to run each query (default 1). + When > 1, each query is run independently N times. + Ignored when ``workflow_result`` is provided. + + Returns: + A list of ``EvalResults``, one per evaluator provider. + + Example: + + .. 
code-block:: python + + from agent_framework.foundry import FoundryEvals + + evals = FoundryEvals(project_client=client, model="gpt-4o") + result = await workflow.run("Plan a trip to Paris") + + eval_results = await evaluate_workflow( + workflow=workflow, + workflow_result=result, + evaluators=evals, + ) + for r in eval_results: + print(f"{r.provider}:") + for name, sub in r.sub_results.items(): + print(f" {name}: {sub.passed}/{sub.total}") + """ + from ._workflows._workflow import WorkflowRunResult as WRR + + # Normalize singular query to list + if isinstance(queries, str): + queries = [queries] + + if workflow_result is None and queries is None: + raise ValueError("Provide either 'workflow_result' or 'queries'.") + + if num_repetitions < 1: + raise ValueError(f"num_repetitions must be >= 1, got {num_repetitions}.") + + wf_name = eval_name or f"Workflow Eval: {workflow.__class__.__name__}" + evaluator_list = _resolve_evaluators(evaluators) + + # Collect per-agent data and overall items + all_agent_data: list[_AgentEvalData] = [] + overall_items: list[EvalItem] = [] + + if queries is not None: + results_list: list[WRR] = [] + for _rep in range(num_repetitions): + for q in queries: + result = await workflow.run(q) + if not isinstance(result, WRR): + raise TypeError(f"Expected WorkflowRunResult from workflow.run(), got {type(result).__name__}.") + results_list.append(result) + all_agent_data.extend(_extract_agent_eval_data(result, workflow)) + if include_overall: + overall_item = _build_overall_item(q, result) + if overall_item: + overall_items.append(overall_item) + else: + assert workflow_result is not None # noqa: S101 # nosec B101 + all_agent_data = _extract_agent_eval_data(workflow_result, workflow) + if include_overall: + original_query = _extract_overall_query(workflow_result) + if original_query: + overall_item = _build_overall_item(original_query, workflow_result) + if overall_item: + overall_items.append(overall_item) + + # Group agent data by executor ID + agents_by_id: dict[str, list[_AgentEvalData]] = {} + if include_per_agent and all_agent_data: + for ad in all_agent_data: + agents_by_id.setdefault(ad["executor_id"], []).append(ad) + + # Build per-agent items once (shared across providers). + agent_items_by_id: dict[str, list[EvalItem]] = {} + for executor_id, agent_data_list in agents_by_id.items(): + agent_items_by_id[executor_id] = [ + AgentEvalConverter.to_eval_item( + query=ad["query"], + response=ad["response"], + agent=ad["agent"], + ) + for ad in agent_data_list + ] + + if not agent_items_by_id and not overall_items: + raise ValueError( + "No agent executor data found in the workflow result. Ensure the workflow uses AgentExecutor-based agents." 
+ ) + + # Stamp split strategy on all items so evaluators respect it + if conversation_split is not None: + for items in agent_items_by_id.values(): + for item in items: + item.split_strategy = conversation_split + for item in overall_items: + item.split_strategy = conversation_split + + # Run each provider, building per-agent sub_results for each + all_results: list[EvalResults] = [] + for ev in evaluator_list: + suffix = f" ({ev.name})" if len(evaluator_list) > 1 else "" + sub_results: dict[str, EvalResults] = {} + + # Per-agent evals + for executor_id, items in agent_items_by_id.items(): + agent_result = await ev.evaluate(items, eval_name=f"{wf_name} — {executor_id}{suffix}") + sub_results[executor_id] = agent_result + + # Overall eval + if include_overall and overall_items: + overall_result = await ev.evaluate(overall_items, eval_name=f"{wf_name} — overall{suffix}") + elif sub_results: + # Aggregate from sub-results + total_passed = sum(s.passed for s in sub_results.values()) + total_failed = sum(s.failed for s in sub_results.values()) + all_completed = all(s.status == "completed" for s in sub_results.values()) + overall_result = EvalResults( + provider=ev.name, + eval_id="aggregate", + run_id="aggregate", + status="completed" if all_completed else "partial", + result_counts={ + "passed": total_passed, + "failed": total_failed, + }, + ) + else: + raise ValueError( + "No agent executor data found in the workflow result. " + "Ensure the workflow uses AgentExecutor-based agents." + ) + + overall_result.sub_results = sub_results + all_results.append(overall_result) + + return all_results + + +# endregion + +# region Internal helpers + + +def _build_overall_item( + query: str, + workflow_result: WorkflowRunResult, +) -> EvalItem | None: + """Build an EvalItem for the overall workflow output.""" + outputs = workflow_result.get_outputs() + if not outputs: + return None + + final_output: Any = outputs[-1] + overall_response: AgentResponse[None] + if isinstance(final_output, list) and final_output and isinstance(final_output[0], Message): + msgs: list[Message] = [m for m in cast(list[Any], final_output) if isinstance(m, Message)] # type: ignore[redundant-cast] + response_text = " ".join(str(m.text) for m in msgs if m.role == "assistant") + overall_response = AgentResponse(messages=[Message("assistant", [response_text])]) + elif isinstance(final_output, AgentResponse): + overall_response = cast(AgentResponse[None], final_output) + else: + overall_response = AgentResponse( + messages=[Message("assistant", [str(final_output)])] # type: ignore[reportUnknownArgumentType] + ) + + return AgentEvalConverter.to_eval_item(query=query, response=overall_response) + + +def _resolve_evaluators( + evaluators: Evaluator | Callable[..., Any] | Sequence[Evaluator | Callable[..., Any]], +) -> list[Evaluator]: + """Normalize evaluators into a list of concrete ``Evaluator`` instances. + + Bare callables (``EvalCheck`` functions, ``@evaluator`` decorated) are + collected and wrapped in a single ``LocalEvaluator``. 
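+
+    For example, assuming ``foundry_evals`` is a concrete ``Evaluator``
+    instance (illustrative sketch):
+
+    .. code-block:: python
+
+        _resolve_evaluators([keyword_check("sunny"), foundry_evals, tool_calls_present])
+        # -> [LocalEvaluator(keyword_check("sunny")), foundry_evals,
+        #     LocalEvaluator(tool_calls_present)]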
+ """ + raw_list: list[Any] = ( + [evaluators] if isinstance(evaluators, Evaluator) or callable(evaluators) else list(evaluators) + ) + + resolved: list[Evaluator] = [] + pending_checks: list[Callable[..., Any]] = [] + + for item in raw_list: + if isinstance(item, Evaluator): + if pending_checks: + resolved.append(LocalEvaluator(*pending_checks)) + pending_checks = [] + resolved.append(item) + elif callable(item): + pending_checks.append(item) + else: + raise TypeError(f"Expected an Evaluator or callable, got {type(item).__name__}") + + if pending_checks: + resolved.append(LocalEvaluator(*pending_checks)) + + return resolved + + +async def _run_evaluators( + evaluators: Evaluator | Callable[..., Any] | Sequence[Evaluator | Callable[..., Any]], + items: Sequence[EvalItem], + *, + eval_name: str, +) -> list[EvalResults]: + """Run one or more evaluators and return a result per provider. + + Bare ``EvalCheck`` callables (including ``@evaluator`` decorated + functions and helpers like ``keyword_check``) are auto-wrapped in a + ``LocalEvaluator`` so they can be passed directly in the evaluators list. + """ + evaluator_list = _resolve_evaluators(evaluators) + + async def _run_single_evaluator( + ev: Evaluator, + eval_items: Sequence[EvalItem], + name: str, + suffix: str, + ) -> EvalResults: + return await ev.evaluate(eval_items, eval_name=f"{name}{suffix}") + + results = await asyncio.gather(*[ + _run_single_evaluator(ev, items, eval_name, f" ({ev.name})" if len(evaluator_list) > 1 else "") + for ev in evaluator_list + ]) + return list(results) + + +# endregion diff --git a/python/packages/core/agent_framework/foundry/__init__.py b/python/packages/core/agent_framework/foundry/__init__.py index b8092909b4..0ebf0a9389 100644 --- a/python/packages/core/agent_framework/foundry/__init__.py +++ b/python/packages/core/agent_framework/foundry/__init__.py @@ -12,6 +12,7 @@ "FoundryAgent": ("agent_framework_foundry", "agent-framework-foundry"), "FoundryChatClient": ("agent_framework_foundry", "agent-framework-foundry"), "FoundryChatOptions": ("agent_framework_foundry", "agent-framework-foundry"), + "FoundryEvals": ("agent_framework_foundry", "agent-framework-foundry"), "FoundryMemoryProvider": ("agent_framework_foundry", "agent-framework-foundry"), "FoundryLocalChatOptions": ("agent_framework_foundry_local", "agent-framework-foundry-local"), "FoundryLocalClient": ("agent_framework_foundry_local", "agent-framework-foundry-local"), @@ -19,6 +20,8 @@ "RawFoundryAgent": ("agent_framework_foundry", "agent-framework-foundry"), "RawFoundryAgentChatClient": ("agent_framework_foundry", "agent-framework-foundry"), "RawFoundryChatClient": ("agent_framework_foundry", "agent-framework-foundry"), + "evaluate_foundry_target": ("agent_framework_foundry", "agent-framework-foundry"), + "evaluate_traces": ("agent_framework_foundry", "agent-framework-foundry"), } diff --git a/python/packages/core/agent_framework/foundry/__init__.pyi b/python/packages/core/agent_framework/foundry/__init__.pyi index 22c0b38b06..534b7fa5bc 100644 --- a/python/packages/core/agent_framework/foundry/__init__.pyi +++ b/python/packages/core/agent_framework/foundry/__init__.pyi @@ -7,10 +7,13 @@ from agent_framework_foundry import ( FoundryAgent, FoundryChatClient, FoundryChatOptions, + FoundryEvals, FoundryMemoryProvider, RawFoundryAgent, RawFoundryAgentChatClient, RawFoundryChatClient, + evaluate_foundry_target, + evaluate_traces, ) from agent_framework_foundry_local import ( FoundryLocalChatOptions, @@ -22,6 +25,7 @@ __all__ = [ "FoundryAgent", 
"FoundryChatClient", "FoundryChatOptions", + "FoundryEvals", "FoundryLocalChatOptions", "FoundryLocalClient", "FoundryLocalSettings", @@ -29,4 +33,6 @@ __all__ = [ "RawFoundryAgent", "RawFoundryAgentChatClient", "RawFoundryChatClient", + "evaluate_foundry_target", + "evaluate_traces", ] diff --git a/python/packages/core/tests/core/test_local_eval.py b/python/packages/core/tests/core/test_local_eval.py new file mode 100644 index 0000000000..96b0e1a391 --- /dev/null +++ b/python/packages/core/tests/core/test_local_eval.py @@ -0,0 +1,1028 @@ +# Copyright (c) Microsoft. All rights reserved. + +"""Tests for evaluator checks and LocalEvaluator.""" + +from __future__ import annotations + +import inspect + +import pytest + +from agent_framework._evaluation import ( + CheckResult, + EvalItem, + ExpectedToolCall, + LocalEvaluator, + _coerce_result, + evaluator, + keyword_check, + tool_call_args_match, + tool_called_check, + tool_calls_present, +) +from agent_framework._types import Content, Message + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_item( + query: str = "What's the weather in Paris?", + response: str = "It's sunny and 75°F", + expected_output: str | None = None, + conversation: list | None = None, + tools: list | None = None, + context: str | None = None, +) -> EvalItem: + if conversation is None: + conversation = [Message("user", [query]), Message("assistant", [response])] + return EvalItem( + conversation=conversation, + expected_output=expected_output, + tools=tools, + context=context, + ) + + +# --------------------------------------------------------------------------- +# Tier 1: (query, response) -> result +# --------------------------------------------------------------------------- + + +class TestTier1SimpleChecks: + @pytest.mark.asyncio + async def test_bool_return_true(self): + @evaluator + def has_temperature(query: str, response: str) -> bool: + return "°F" in response + + result = await has_temperature(_make_item()) + assert result.passed is True + assert result.check_name == "has_temperature" + + @pytest.mark.asyncio + async def test_bool_return_false(self): + @evaluator + def has_celsius(query: str, response: str) -> bool: + return "°C" in response + + result = await has_celsius(_make_item()) + assert result.passed is False + + @pytest.mark.asyncio + async def test_float_return_passing(self): + @evaluator + def length_score(response: str) -> float: + return min(len(response) / 10, 1.0) + + result = await length_score(_make_item()) + assert result.passed is True + assert "score=" in result.reason + + @pytest.mark.asyncio + async def test_float_return_failing(self): + @evaluator + def always_low(response: str) -> float: + return 0.1 + + result = await always_low(_make_item()) + assert result.passed is False + + @pytest.mark.asyncio + async def test_response_only(self): + """Function with only 'response' param should work.""" + + @evaluator + def is_short(response: str) -> bool: + return len(response) < 1000 + + result = await is_short(_make_item()) + assert result.passed is True + + @pytest.mark.asyncio + async def test_query_only(self): + """Function with only 'query' param should work.""" + + @evaluator + def is_question(query: str) -> bool: + return "?" 
in query + + result = await is_question(_make_item()) + assert result.passed is True + + +# --------------------------------------------------------------------------- +# Tier 2: (query, response, expected_output) -> result +# --------------------------------------------------------------------------- + + +class TestTier2GroundTruth: + @pytest.mark.asyncio + async def test_exact_match(self): + @evaluator + def exact_match(response: str, expected_output: str) -> bool: + return response.strip() == expected_output.strip() + + item = _make_item(response="42", expected_output="42") + assert (await exact_match(item)).passed is True + + item2 = _make_item(response="43", expected_output="42") + assert (await exact_match(item2)).passed is False + + @pytest.mark.asyncio + async def test_expected_output_defaults_to_empty(self): + """When expected_output is None on the item, it should be passed as ''.""" + + @evaluator + def check_expected(expected_output: str) -> bool: + return expected_output == "" + + result = await check_expected(_make_item(expected_output=None)) + assert result.passed is True + + @pytest.mark.asyncio + async def test_similarity_score(self): + @evaluator + def word_overlap(response: str, expected_output: str) -> float: + r_words = set(response.lower().split()) + e_words = set(expected_output.lower().split()) + if not e_words: + return 1.0 + return len(r_words & e_words) / len(e_words) + + item = _make_item(response="sunny warm day", expected_output="warm sunny afternoon") + result = await word_overlap(item) + assert result.passed is True # 2/3 overlap ≥ 0.5 + + +# --------------------------------------------------------------------------- +# Tier 3: full context (conversation, tools, context) +# --------------------------------------------------------------------------- + + +class TestTier3FullContext: + @pytest.mark.asyncio + async def test_conversation_access(self): + @evaluator + def multi_turn(query: str, response: str, *, conversation: list) -> bool: + return len(conversation) >= 2 + + item = _make_item(conversation=[Message("user", []), Message("assistant", [])]) + assert (await multi_turn(item)).passed is True + + item2 = _make_item(conversation=[Message("user", [])]) + assert (await multi_turn(item2)).passed is False + + @pytest.mark.asyncio + async def test_tools_access(self): + @evaluator + def has_tools(tools: list) -> bool: + return len(tools) > 0 + + mock_tool = type( + "MockTool", + (), + {"name": "get_weather", "description": "Get weather", "parameters": lambda self: {}}, + )() + item = _make_item(tools=[mock_tool]) + assert (await has_tools(item)).passed is True + + @pytest.mark.asyncio + async def test_context_access(self): + @evaluator + def grounded(response: str, context: str) -> bool: + if not context: + return True + return any(word in response.lower() for word in context.lower().split()) + + item = _make_item(response="It's sunny", context="sunny warm") + assert (await grounded(item)).passed is True + + @pytest.mark.asyncio + async def test_all_params(self): + @evaluator + def full_check( + query: str, + response: str, + expected_output: str, + conversation: list, + tools: list, + context: str, + ) -> bool: + return all([query, response, expected_output is not None, isinstance(conversation, list)]) + + item = _make_item(expected_output="foo", context="bar") + assert (await full_check(item)).passed is True + + +# --------------------------------------------------------------------------- +# Return type coercion +# 
--------------------------------------------------------------------------- + + +class TestReturnTypeCoercion: + @pytest.mark.asyncio + async def test_dict_with_score(self): + @evaluator + def scored(response: str) -> dict: + return {"score": 0.9, "reason": "good answer"} + + result = await scored(_make_item()) + assert result.passed is True + assert result.reason == "good answer" + + @pytest.mark.asyncio + async def test_dict_with_score_below_threshold(self): + @evaluator + def low_scored(response: str) -> dict: + return {"score": 0.3} + + result = await low_scored(_make_item()) + assert result.passed is False + + @pytest.mark.asyncio + async def test_dict_with_custom_threshold(self): + @evaluator + def custom_threshold(response: str) -> dict: + return {"score": 0.3, "threshold": 0.2} + + result = await custom_threshold(_make_item()) + assert result.passed is True + + @pytest.mark.asyncio + async def test_dict_with_passed(self): + @evaluator + def explicit_pass(response: str) -> dict: + return {"passed": True, "reason": "all good"} + + result = await explicit_pass(_make_item()) + assert result.passed is True + assert result.reason == "all good" + + @pytest.mark.asyncio + async def test_check_result_passthrough(self): + @evaluator + def returns_check_result(response: str) -> CheckResult: + return CheckResult(True, "direct result", "custom") + + result = await returns_check_result(_make_item()) + assert result.passed is True + assert result.reason == "direct result" + assert result.check_name == "custom" + + @pytest.mark.asyncio + async def test_unsupported_return_type(self): + @evaluator + def bad_return(response: str) -> str: + return "oops" + + with pytest.raises(TypeError, match="unsupported type"): + await bad_return(_make_item()) + + @pytest.mark.asyncio + async def test_int_return(self): + @evaluator + def int_score(response: str) -> int: + return 1 + + result = await int_score(_make_item()) + assert result.passed is True + + +# --------------------------------------------------------------------------- +# Decorator variants +# --------------------------------------------------------------------------- + + +class TestDecoratorVariants: + @pytest.mark.asyncio + async def test_decorator_no_parens(self): + @evaluator + def my_check(response: str) -> bool: + return True + + assert (await my_check(_make_item())).passed is True + + @pytest.mark.asyncio + async def test_decorator_with_name(self): + @evaluator(name="custom_name") + def my_check(response: str) -> bool: + return True + + assert my_check.__name__ == "custom_name" + result = await my_check(_make_item()) + assert result.check_name == "custom_name" + + @pytest.mark.asyncio + async def test_direct_call(self): + def raw_fn(query: str, response: str) -> bool: + return len(response) > 0 + + check = evaluator(raw_fn, name="direct") + result = await check(_make_item()) + assert result.passed is True + assert result.check_name == "direct" + + +# --------------------------------------------------------------------------- +# Error handling +# --------------------------------------------------------------------------- + + +class TestErrorHandling: + @pytest.mark.asyncio + async def test_unknown_required_param_raises(self): + with pytest.raises(TypeError, match="unknown required parameter"): + + @evaluator + def bad_params(query: str, unknown_param: str) -> bool: + return True + + @pytest.mark.asyncio + async def test_unknown_optional_param_ok(self): + @evaluator + def optional_unknown(query: str, foo: str = "default") -> bool: + return foo 
== "default" + + result = await optional_unknown(_make_item()) + assert result.passed is True + + @pytest.mark.asyncio + async def test_async_function_works_with_evaluator(self): + """Using an async function with @evaluator should work.""" + + @evaluator + async def async_fn(response: str) -> bool: + return True + + result = async_fn(_make_item()) + # Should return an awaitable + assert inspect.isawaitable(result) + check_result = await result + assert check_result.passed is True + + +# --------------------------------------------------------------------------- +# Integration with LocalEvaluator +# --------------------------------------------------------------------------- + + +class TestLocalEvaluatorIntegration: + @pytest.mark.asyncio + async def test_mixed_checks(self): + """Function evaluators mix with built-in checks in LocalEvaluator.""" + + @evaluator + def length_ok(response: str) -> bool: + return len(response) > 5 + + local = LocalEvaluator( + keyword_check("sunny"), + length_ok, + ) + items = [_make_item()] + results = await local.evaluate(items, eval_name="mixed test") + + assert results.status == "completed" + assert results.result_counts["passed"] == 1 + assert results.result_counts["failed"] == 0 + + @pytest.mark.asyncio + async def test_evaluator_failure_counted(self): + @evaluator + def always_fail(response: str) -> bool: + return False + + local = LocalEvaluator(always_fail) + results = await local.evaluate([_make_item()]) + + assert results.result_counts["failed"] == 1 + + @pytest.mark.asyncio + async def test_multiple_evaluators(self): + @evaluator + def check_a(response: str) -> float: + return 0.9 + + @evaluator + def check_b(query: str, response: str, expected_output: str) -> bool: + return True + + @evaluator(name="check_c") + def check_c(response: str, conversation: list) -> dict: + return {"score": 0.8, "reason": "looks good"} + + local = LocalEvaluator(check_a, check_b, check_c) + results = await local.evaluate([_make_item(expected_output="test")]) + + assert results.result_counts["passed"] == 1 + assert "check_a" in results.per_evaluator + assert "check_b" in results.per_evaluator + assert "check_c" in results.per_evaluator + + +# --------------------------------------------------------------------------- +# Async evaluator (via @evaluator which handles async automatically) +# --------------------------------------------------------------------------- + + +class TestAsyncFunctionEvaluator: + @pytest.mark.asyncio + async def test_async_evaluator_in_local(self): + @evaluator + async def async_check(query: str, response: str) -> bool: + return len(response) > 0 + + local = LocalEvaluator(async_check) + results = await local.evaluate([_make_item()]) + assert results.result_counts["passed"] == 1 + + @pytest.mark.asyncio + async def test_async_with_name(self): + @evaluator(name="named_async") + async def my_async(response: str) -> float: + return 0.75 + + result = await my_async(_make_item()) + assert result.passed is True + assert result.check_name == "named_async" + + +# --------------------------------------------------------------------------- +# Auto-wrapping bare checks in evaluate_agent +# --------------------------------------------------------------------------- + + +class TestAutoWrapEvalChecks: + @pytest.mark.asyncio + async def test_bare_check_in_evaluators_list(self): + """Bare EvalCheck callables are auto-wrapped in LocalEvaluator.""" + from agent_framework._evaluation import _run_evaluators + + @evaluator + def is_long(response: str) -> bool: + return 
len(response.split()) > 2 + + items = [_make_item(response="It is sunny and warm today")] + results = await _run_evaluators(is_long, items, eval_name="test") + assert len(results) == 1 + assert results[0].result_counts["passed"] == 1 + + @pytest.mark.asyncio + async def test_mixed_evaluators_and_checks(self): + """Mix of Evaluator instances and bare checks works.""" + from agent_framework._evaluation import _run_evaluators + + @evaluator + def has_words(response: str) -> bool: + return len(response.split()) > 0 + + local = LocalEvaluator(keyword_check("sunny")) + + items = [_make_item(response="It is sunny")] + results = await _run_evaluators([local, has_words], items, eval_name="test") + assert len(results) == 2 + assert all(r.result_counts["passed"] == 1 for r in results) + + @pytest.mark.asyncio + async def test_adjacent_checks_grouped(self): + """Adjacent bare checks are grouped into a single LocalEvaluator.""" + from agent_framework._evaluation import _run_evaluators + + @evaluator + def check_a(response: str) -> bool: + return True + + @evaluator + def check_b(response: str) -> bool: + return True + + items = [_make_item()] + results = await _run_evaluators([check_a, check_b], items, eval_name="test") + # Two adjacent checks → one LocalEvaluator → one result + assert len(results) == 1 + assert results[0].result_counts["passed"] == 1 + + +# --------------------------------------------------------------------------- +# Expected Tool Calls +# --------------------------------------------------------------------------- + + +def _make_tool_call_item( + calls: list[tuple[str, dict | None]], + expected: list[ExpectedToolCall] | None = None, +) -> EvalItem: + """Build an EvalItem with tool calls in the conversation.""" + msgs: list[Message] = [Message("user", ["Do something"])] + for name, args in calls: + msgs.append(Message("assistant", [Content.from_function_call("call_" + name, name, arguments=args)])) + msgs.append(Message("assistant", ["Done"])) + return EvalItem(conversation=msgs, expected_tool_calls=expected) + + +class TestExpectedToolCallType: + def test_name_only(self): + tc = ExpectedToolCall("get_weather") + assert tc.name == "get_weather" + assert tc.arguments is None + + def test_name_and_args(self): + tc = ExpectedToolCall("get_weather", {"location": "NYC"}) + assert tc.name == "get_weather" + assert tc.arguments == {"location": "NYC"} + + +class TestToolCallsPresent: + def test_all_present(self): + item = _make_tool_call_item( + calls=[("get_weather", None), ("get_news", None)], + expected=[ExpectedToolCall("get_weather"), ExpectedToolCall("get_news")], + ) + result = tool_calls_present(item) + assert result.passed is True + assert result.check_name == "tool_calls_present" + + def test_missing_tool(self): + item = _make_tool_call_item( + calls=[("get_weather", None)], + expected=[ExpectedToolCall("get_weather"), ExpectedToolCall("get_news")], + ) + result = tool_calls_present(item) + assert result.passed is False + assert "get_news" in result.reason + + def test_extras_ok(self): + item = _make_tool_call_item( + calls=[("get_weather", None), ("get_news", None), ("get_stock", None)], + expected=[ExpectedToolCall("get_weather")], + ) + result = tool_calls_present(item) + assert result.passed is True + + def test_no_expected(self): + item = _make_tool_call_item(calls=[("get_weather", None)]) + result = tool_calls_present(item) + assert result.passed is True + assert "No expected" in result.reason + + +class TestToolCallArgsMatch: + def test_name_only_match(self): + item = 
_make_tool_call_item( + calls=[("get_weather", {"location": "NYC"})], + expected=[ExpectedToolCall("get_weather")], + ) + result = tool_call_args_match(item) + assert result.passed is True + + def test_args_exact_match(self): + item = _make_tool_call_item( + calls=[("get_weather", {"location": "NYC", "units": "fahrenheit"})], + expected=[ExpectedToolCall("get_weather", {"location": "NYC"})], + ) + # Subset match — extra "units" key is OK + result = tool_call_args_match(item) + assert result.passed is True + + def test_args_mismatch(self): + item = _make_tool_call_item( + calls=[("get_weather", {"location": "LA"})], + expected=[ExpectedToolCall("get_weather", {"location": "NYC"})], + ) + result = tool_call_args_match(item) + assert result.passed is False + assert "args mismatch" in result.reason + + def test_tool_not_called(self): + item = _make_tool_call_item( + calls=[("get_news", None)], + expected=[ExpectedToolCall("get_weather", {"location": "NYC"})], + ) + result = tool_call_args_match(item) + assert result.passed is False + assert "not called" in result.reason + + def test_multiple_expected(self): + item = _make_tool_call_item( + calls=[ + ("get_weather", {"location": "NYC"}), + ("book_flight", {"destination": "LA", "date": "tomorrow"}), + ], + expected=[ + ExpectedToolCall("get_weather", {"location": "NYC"}), + ExpectedToolCall("book_flight", {"destination": "LA"}), + ], + ) + result = tool_call_args_match(item) + assert result.passed is True + + def test_no_expected(self): + item = _make_tool_call_item(calls=[("get_weather", None)]) + result = tool_call_args_match(item) + assert result.passed is True + + +class TestExpectedToolCallsFieldInjection: + """Test that @evaluator can receive expected_tool_calls via parameter injection.""" + + @pytest.mark.asyncio + async def test_injection(self): + @evaluator + def check_tools(expected_tool_calls: list) -> bool: + return len(expected_tool_calls) == 2 + + item = _make_tool_call_item( + calls=[], + expected=[ExpectedToolCall("a"), ExpectedToolCall("b")], + ) + result = await check_tools(item) + assert result.passed is True + + @pytest.mark.asyncio + async def test_injection_empty_default(self): + @evaluator + def check_tools(expected_tool_calls: list) -> bool: + return len(expected_tool_calls) == 0 + + item = _make_tool_call_item(calls=[]) + result = await check_tools(item) + assert result.passed is True + + +# --------------------------------------------------------------------------- +# Per-item results (auditing) +# --------------------------------------------------------------------------- + + +class TestPerItemResults: + """LocalEvaluator should produce per-item EvalItemResult with query/response.""" + + @pytest.mark.asyncio + async def test_items_populated_with_query_and_response(self): + @evaluator + def is_sunny(response: str) -> bool: + return "sunny" in response.lower() + + item = _make_item(query="Weather?", response="It's sunny!") + local = LocalEvaluator(is_sunny) + results = await local.evaluate([item]) + + assert len(results.items) == 1 + ri = results.items[0] + assert ri.item_id == "0" + assert ri.status == "pass" + assert ri.input_text == "Weather?" + assert ri.output_text == "It's sunny!" 
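+        # One EvalScoreResult is recorded per check, keyed by the check's name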
+ assert len(ri.scores) == 1 + assert ri.scores[0].name == "is_sunny" + assert ri.scores[0].passed is True + + @pytest.mark.asyncio + async def test_items_populated_on_failure(self): + @evaluator + def always_fail(response: str) -> bool: + return False + + item = _make_item(query="Hello", response="World") + local = LocalEvaluator(always_fail) + results = await local.evaluate([item]) + + assert len(results.items) == 1 + ri = results.items[0] + assert ri.status == "fail" + assert ri.input_text == "Hello" + assert ri.output_text == "World" + assert ri.scores[0].passed is False + assert ri.scores[0].score == 0.0 + + @pytest.mark.asyncio + async def test_multiple_items_indexed(self): + @evaluator + def pass_all(response: str) -> bool: + return True + + items = [ + _make_item(query="Q1", response="R1"), + _make_item(query="Q2", response="R2"), + ] + local = LocalEvaluator(pass_all) + results = await local.evaluate(items) + + assert len(results.items) == 2 + assert results.items[0].item_id == "0" + assert results.items[0].input_text == "Q1" + assert results.items[0].output_text == "R1" + assert results.items[1].item_id == "1" + assert results.items[1].input_text == "Q2" + assert results.items[1].output_text == "R2" + + +# --------------------------------------------------------------------------- +# num_repetitions validation +# --------------------------------------------------------------------------- + + +class TestNumRepetitions: + """Tests for the num_repetitions parameter on evaluate_agent.""" + + @pytest.mark.asyncio + async def test_num_repetitions_validation_rejects_zero(self): + from agent_framework._evaluation import evaluate_agent + + with pytest.raises(ValueError, match="num_repetitions must be >= 1"): + await evaluate_agent( + queries=["Hello"], + evaluators=LocalEvaluator(keyword_check("hello")), + num_repetitions=0, + ) + + @pytest.mark.asyncio + async def test_num_repetitions_validation_rejects_negative(self): + from agent_framework._evaluation import evaluate_agent + + with pytest.raises(ValueError, match="num_repetitions must be >= 1"): + await evaluate_agent( + queries=["Hello"], + evaluators=LocalEvaluator(keyword_check("hello")), + num_repetitions=-1, + ) + + @pytest.mark.asyncio + async def test_num_repetitions_multiplies_items(self): + """num_repetitions=2 produces 2× the eval items.""" + from unittest.mock import AsyncMock, MagicMock + + from agent_framework._evaluation import evaluate_agent + from agent_framework._types import AgentResponse, Message + + mock_agent = MagicMock() + mock_agent.name = "test" + mock_agent.default_options = {} + mock_agent.run = AsyncMock(return_value=AgentResponse(messages=[Message("assistant", ["reply"])])) + + results = await evaluate_agent( + agent=mock_agent, + queries=["Q1", "Q2"], + evaluators=LocalEvaluator(keyword_check("reply")), + num_repetitions=2, + ) + # 2 queries × 2 reps = 4 items + assert results[0].total == 4 + assert mock_agent.run.call_count == 4 + + @pytest.mark.asyncio + async def test_num_repetitions_with_expected_output(self): + """num_repetitions > 1 correctly stamps expected_output via modulo.""" + from unittest.mock import AsyncMock, MagicMock + + from agent_framework._evaluation import evaluate_agent + from agent_framework._types import AgentResponse, Message + + mock_agent = MagicMock() + mock_agent.name = "test" + mock_agent.default_options = {} + mock_agent.run = AsyncMock(return_value=AgentResponse(messages=[Message("assistant", ["reply"])])) + + @evaluator + def check_expected(response: str, expected_output: 
str) -> dict: + return {"passed": expected_output in ("A", "B"), "reason": f"expected={expected_output}"} + + results = await evaluate_agent( + agent=mock_agent, + queries=["Q1", "Q2"], + expected_output=["A", "B"], + evaluators=LocalEvaluator(check_expected), + num_repetitions=2, + ) + # 2 queries × 2 reps = 4 items, all should pass + assert results[0].total == 4 + assert results[0].passed == 4 + + @pytest.mark.asyncio + async def test_num_repetitions_with_expected_tool_calls(self): + """num_repetitions > 1 correctly stamps expected_tool_calls via modulo.""" + from unittest.mock import AsyncMock, MagicMock + + from agent_framework._evaluation import evaluate_agent + from agent_framework._types import AgentResponse, Content, Message + + mock_agent = MagicMock() + mock_agent.name = "test" + mock_agent.default_options = {} + mock_agent.run = AsyncMock( + return_value=AgentResponse( + messages=[ + Message( + "assistant", + [Content.from_function_call("c1", "get_weather", arguments={"location": "NYC"})], + ), + Message("tool", [Content.from_function_result("c1", result="Sunny")]), + Message("assistant", ["It's sunny"]), + ] + ) + ) + + results = await evaluate_agent( + agent=mock_agent, + queries=["Q1"], + expected_tool_calls=[[ExpectedToolCall("get_weather")]], + evaluators=LocalEvaluator(tool_calls_present), + num_repetitions=2, + ) + # 1 query × 2 reps = 2 items + assert results[0].total == 2 + assert results[0].passed == 2 + + +# --------------------------------------------------------------------------- +# r3 review: additional test coverage +# --------------------------------------------------------------------------- + + +class TestToolCalledCheckModeAny: + """Tests for tool_called_check with mode='any'.""" + + async def test_any_mode_one_tool_called(self): + """mode='any' passes when at least one expected tool is called.""" + item = _make_item( + conversation=[ + Message("user", ["Do something"]), + Message("assistant", [Content.from_function_call("c1", "tool_a", arguments={})]), + Message("tool", [Content.from_function_result("c1", result="ok")]), + Message("assistant", ["Done"]), + ] + ) + check = tool_called_check("tool_a", "tool_b", mode="any") + result = check(item) + assert result.passed is True + + async def test_any_mode_none_called(self): + """mode='any' fails when no expected tools are called.""" + item = _make_item( + conversation=[ + Message("user", ["Do something"]), + Message("assistant", ["I can't use tools"]), + ] + ) + check = tool_called_check("tool_a", "tool_b", mode="any") + result = check(item) + assert result.passed is False + assert "None of expected tools" in result.reason + + +class TestCoerceResultScoreError: + """Tests for _coerce_result handling non-numeric score.""" + + def test_non_numeric_score_raises(self): + """Dict with non-numeric score raises TypeError.""" + with pytest.raises(TypeError, match="non-numeric 'score'"): + _coerce_result({"score": "high"}, "test_check") + + def test_none_score_raises(self): + with pytest.raises(TypeError, match="non-numeric 'score'"): + _coerce_result({"score": None}, "test_check") + + +class TestBareCheckViaEvaluateAgent: + """Test bare callable check functions through the public evaluate_agent API.""" + + async def test_bare_check_through_evaluate_agent(self): + from unittest.mock import AsyncMock, MagicMock + + from agent_framework._evaluation import evaluate_agent + from agent_framework._types import AgentResponse + + mock_agent = MagicMock() + mock_agent.name = "test" + mock_agent.default_options = {} + 
mock_agent.run = AsyncMock( + return_value=AgentResponse(messages=[Message("assistant", ["The weather is sunny"])]) + ) + + is_long = keyword_check("weather") + + results = await evaluate_agent( + agent=mock_agent, + queries=["Q"], + evaluators=is_long, + ) + assert results[0].total == 1 + assert results[0].passed == 1 + + +class TestEvaluateAgentModuloWrapping: + """Test that expected_output stamps correctly with num_repetitions > 1 and multiple queries.""" + + async def test_modulo_stamps_correct_expected_output(self): + from unittest.mock import AsyncMock, MagicMock + + from agent_framework._evaluation import evaluate_agent + from agent_framework._types import AgentResponse + + mock_agent = MagicMock() + mock_agent.name = "test" + mock_agent.default_options = {} + mock_agent.run = AsyncMock(return_value=AgentResponse(messages=[Message("assistant", ["reply"])])) + + # Track which expected_output each item gets + seen_expected: list[str] = [] + + @evaluator + def capture_expected(response: str, expected_output: str) -> dict: + seen_expected.append(expected_output) + return {"passed": True, "reason": "ok"} + + await evaluate_agent( + agent=mock_agent, + queries=["Q1", "Q2", "Q3"], + expected_output=["A", "B", "C"], + evaluators=LocalEvaluator(capture_expected), + num_repetitions=2, + ) + # 3 queries × 2 reps = 6 items; modulo wrapping: A,B,C,A,B,C + assert seen_expected == ["A", "B", "C", "A", "B", "C"] + + +class TestEvaluateAgentQueriesWithoutAgent: + """Test error message when queries provided without agent.""" + + async def test_queries_without_agent_gives_clear_error(self): + from agent_framework._evaluation import evaluate_agent + + with pytest.raises(ValueError, match="Provide 'agent' when using 'queries'"): + await evaluate_agent( + queries=["hello"], + evaluators=LocalEvaluator(keyword_check("x")), + ) + + +# --------------------------------------------------------------------------- +# r5 review: all_passed with result_counts=None + sub_results +# --------------------------------------------------------------------------- + + +class TestAllPassedSubResults: + """Tests for EvalResults.all_passed with sub_results.""" + + def test_all_passed_ignores_own_counts_when_none(self): + """When result_counts is None (aggregate), all_passed delegates to sub_results.""" + from agent_framework._evaluation import EvalResults + + sub_pass = EvalResults( + provider="Local", + eval_id="e1", + run_id="r1", + status="completed", + result_counts={"passed": 2, "failed": 0, "errored": 0}, + ) + parent = EvalResults( + provider="Local", + eval_id="e0", + run_id="r0", + status="completed", + result_counts=None, + sub_results={"agent1": sub_pass}, + ) + assert parent.all_passed is True + + def test_all_passed_parent_fails_when_own_counts_fail(self): + """When parent has result_counts with failures, all_passed is False even if sub_results pass.""" + from agent_framework._evaluation import EvalResults + + sub_pass = EvalResults( + provider="Local", + eval_id="e1", + run_id="r1", + status="completed", + result_counts={"passed": 2, "failed": 0, "errored": 0}, + ) + parent = EvalResults( + provider="Local", + eval_id="e0", + run_id="r0", + status="completed", + result_counts={"passed": 1, "failed": 1, "errored": 0}, + sub_results={"agent1": sub_pass}, + ) + assert parent.all_passed is False + + +# --------------------------------------------------------------------------- +# r5 review: _build_overall_item with empty outputs +# --------------------------------------------------------------------------- + + 
+class TestBuildOverallItemEmpty: + """Test _build_overall_item returns None for empty workflow outputs.""" + + def test_returns_none_for_empty_outputs(self): + from unittest.mock import MagicMock + + from agent_framework._evaluation import _build_overall_item + + mock_result = MagicMock() + mock_result.get_outputs.return_value = [] + item = _build_overall_item("Hello", mock_result) + assert item is None diff --git a/python/packages/foundry/agent_framework_foundry/__init__.py b/python/packages/foundry/agent_framework_foundry/__init__.py index 50c500ad4e..a67b5df801 100644 --- a/python/packages/foundry/agent_framework_foundry/__init__.py +++ b/python/packages/foundry/agent_framework_foundry/__init__.py @@ -4,6 +4,11 @@ from ._agent import FoundryAgent, RawFoundryAgent, RawFoundryAgentChatClient from ._chat_client import FoundryChatClient, FoundryChatOptions, RawFoundryChatClient +from ._foundry_evals import ( + FoundryEvals, + evaluate_foundry_target, + evaluate_traces, +) from ._memory_provider import FoundryMemoryProvider try: @@ -15,9 +20,12 @@ "FoundryAgent", "FoundryChatClient", "FoundryChatOptions", + "FoundryEvals", "FoundryMemoryProvider", "RawFoundryAgent", "RawFoundryAgentChatClient", "RawFoundryChatClient", "__version__", + "evaluate_foundry_target", + "evaluate_traces", ] diff --git a/python/packages/foundry/agent_framework_foundry/_foundry_evals.py b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py new file mode 100644 index 0000000000..9762eb158b --- /dev/null +++ b/python/packages/foundry/agent_framework_foundry/_foundry_evals.py @@ -0,0 +1,901 @@ +# Copyright (c) Microsoft. All rights reserved. + +"""Microsoft Foundry Evals integration for Microsoft Agent Framework. + +Provides ``FoundryEvals``, an ``Evaluator`` implementation backed by Azure AI +Foundry's built-in evaluators. See docs/decisions/0018-foundry-evals-integration.md +for the design rationale. + +Example: + +.. code-block:: python + + from agent_framework import evaluate_agent + from agent_framework.foundry import FoundryEvals + + # Zero-config: reads FOUNDRY_PROJECT_ENDPOINT and FOUNDRY_MODEL from env + evals = FoundryEvals() + results = await evaluate_agent( + agent=my_agent, + queries=["What's the weather in Seattle?"], + evaluators=evals, + ) + results[0].raise_for_status() + print(results[0].report_url) +""" + +from __future__ import annotations + +import asyncio +import logging +from collections.abc import Sequence +from typing import TYPE_CHECKING, Any, cast + +from agent_framework._evaluation import ( + AgentEvalConverter, + ConversationSplit, + ConversationSplitter, + EvalItem, + EvalItemResult, + EvalResults, + EvalScoreResult, +) +from openai import AsyncOpenAI + +from ._chat_client import FoundryChatClient + +if TYPE_CHECKING: + from azure.ai.projects.aio import AIProjectClient + from openai.types.evals import RunRetrieveResponse + +logger = logging.getLogger(__name__) + +# Agent evaluators that accept query/response as conversation arrays. +# Maintained manually — check https://learn.microsoft.com/en-us/azure/ai-studio/how-to/develop/evaluate-sdk +# for the latest evaluator list. These are the evaluators that need conversation-format input. 
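+# Evaluators not in this set are mapped to plain string query/response fields
+# when data mapping is built (see _build_testing_criteria).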
+_AGENT_EVALUATORS: set[str] = { + "builtin.intent_resolution", + "builtin.task_adherence", + "builtin.task_completion", + "builtin.task_navigation_efficiency", + "builtin.tool_call_accuracy", + "builtin.tool_selection", + "builtin.tool_input_accuracy", + "builtin.tool_output_utilization", + "builtin.tool_call_success", +} + +# Evaluators that additionally require tool_definitions. +_TOOL_EVALUATORS: set[str] = { + "builtin.tool_call_accuracy", + "builtin.tool_selection", + "builtin.tool_input_accuracy", + "builtin.tool_output_utilization", + "builtin.tool_call_success", +} + +_BUILTIN_EVALUATORS: dict[str, str] = { + # Agent behavior + "intent_resolution": "builtin.intent_resolution", + "task_adherence": "builtin.task_adherence", + "task_completion": "builtin.task_completion", + "task_navigation_efficiency": "builtin.task_navigation_efficiency", + # Tool usage + "tool_call_accuracy": "builtin.tool_call_accuracy", + "tool_selection": "builtin.tool_selection", + "tool_input_accuracy": "builtin.tool_input_accuracy", + "tool_output_utilization": "builtin.tool_output_utilization", + "tool_call_success": "builtin.tool_call_success", + # Quality + "coherence": "builtin.coherence", + "fluency": "builtin.fluency", + "relevance": "builtin.relevance", + "groundedness": "builtin.groundedness", + "response_completeness": "builtin.response_completeness", + "similarity": "builtin.similarity", + # Safety + "violence": "builtin.violence", + "sexual": "builtin.sexual", + "self_harm": "builtin.self_harm", + "hate_unfairness": "builtin.hate_unfairness", +} + +# Default evaluator sets used when evaluators=None +_DEFAULT_EVALUATORS: list[str] = [ + "relevance", + "coherence", + "task_adherence", +] + +_DEFAULT_TOOL_EVALUATORS: list[str] = [ + "tool_call_accuracy", +] + +# Consistency between evaluator sets is enforced by tests in +# test_foundry_evals.py — see TestEvaluatorSetConsistency. + + +def _resolve_evaluator(name: str) -> str: + """Resolve a short evaluator name to its fully-qualified ``builtin.*`` form. + + Args: + name: Short name (e.g. ``"relevance"``) or fully-qualified name + (e.g. ``"builtin.relevance"``). + + Returns: + The fully-qualified evaluator name. + + Raises: + ValueError: If the name is not recognized. + """ + if name.startswith("builtin."): + # Already fully-qualified — pass through as-is. + # We don't validate the specific name because Foundry may add + # new evaluators that aren't in our local mapping. + return name + resolved = _BUILTIN_EVALUATORS.get(name) + if resolved is None: + raise ValueError(f"Unknown evaluator '{name}'. Available: {sorted(_BUILTIN_EVALUATORS)}") + return resolved + + +# --------------------------------------------------------------------------- +# Internal helpers +# --------------------------------------------------------------------------- + + +def _build_testing_criteria( + evaluators: Sequence[str], + model: str, + *, + include_data_mapping: bool = False, +) -> list[dict[str, Any]]: + """Build ``testing_criteria`` for ``evals.create()``. + + Args: + evaluators: Evaluator names. + model: Model deployment for the LLM judge. + include_data_mapping: Whether to include field-level data mapping + (required for the JSONL data source, not needed for response-based). 
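+
+    Returns:
+        One criterion dict per evaluator. A single entry looks roughly like
+        the following (illustrative values; ``data_mapping`` is added only
+        when *include_data_mapping* is set):
+
+        .. code-block:: python
+
+            {
+                "type": "azure_ai_evaluator",
+                "name": "relevance",
+                "evaluator_name": "builtin.relevance",
+                "initialization_parameters": {"deployment_name": "gpt-4o"},
+            }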
+ """ + criteria: list[dict[str, Any]] = [] + for name in evaluators: + qualified = _resolve_evaluator(name) + short = name if not name.startswith("builtin.") else name.split(".")[-1] + + # Structure dictated by the OpenAI evals API — see + # https://platform.openai.com/docs/api-reference/evals/create + entry: dict[str, Any] = { + "type": "azure_ai_evaluator", + "name": short, + "evaluator_name": qualified, + "initialization_parameters": {"deployment_name": model}, + } + + if include_data_mapping: + if qualified in _AGENT_EVALUATORS: + # Agent evaluators: query/response as conversation arrays. + # {{item.*}} are Mustache-style placeholders resolved by the + # evals API against fields in the JSONL data items. + mapping: dict[str, str] = { + "query": "{{item.query_messages}}", + "response": "{{item.response_messages}}", + } + else: + # Quality evaluators: query/response as strings + mapping = { + "query": "{{item.query}}", + "response": "{{item.response}}", + } + if qualified == "builtin.groundedness": + mapping["context"] = "{{item.context}}" + if qualified in _TOOL_EVALUATORS: + mapping["tool_definitions"] = "{{item.tool_definitions}}" + entry["data_mapping"] = mapping + + criteria.append(entry) + return criteria + + +def _build_item_schema(*, has_context: bool = False, has_tools: bool = False) -> dict[str, Any]: + """Build the ``item_schema`` for custom JSONL eval definitions.""" + properties: dict[str, Any] = { + "query": {"type": "string"}, + "response": {"type": "string"}, + "query_messages": {"type": "array"}, + "response_messages": {"type": "array"}, + } + if has_context: + properties["context"] = {"type": "string"} + if has_tools: + properties["tool_definitions"] = {"type": "array"} + return { + "type": "object", + "properties": properties, + "required": ["query", "response"], + } + + +def _resolve_default_evaluators( + evaluators: Sequence[str] | None, + items: Sequence[EvalItem | dict[str, Any]] | None = None, +) -> list[str]: + """Resolve evaluators, applying defaults when ``None``. + + Defaults to relevance + coherence + task_adherence. Automatically adds + tool_call_accuracy when items contain tools. + """ + if evaluators is not None: + return list(evaluators) + + result = list(_DEFAULT_EVALUATORS) + if items is not None: + has_tools = any((item.tools if isinstance(item, EvalItem) else item.get("tool_definitions")) for item in items) + if has_tools: + result.extend(_DEFAULT_TOOL_EVALUATORS) + return result + + +def _filter_tool_evaluators( + evaluators: list[str], + items: Sequence[EvalItem | dict[str, Any]], +) -> list[str]: + """Remove tool evaluators if no items have tool definitions.""" + has_tools = any((item.tools if isinstance(item, EvalItem) else item.get("tool_definitions")) for item in items) + if has_tools: + return evaluators + filtered = [e for e in evaluators if _resolve_evaluator(e) not in _TOOL_EVALUATORS] + if not filtered: + raise ValueError( + f"All requested evaluators {evaluators} require tool definitions, " + "but no items have tools. Either add tool definitions to your items " + "or choose evaluators that do not require tools." 
+ ) + if len(filtered) < len(evaluators): + removed = [e for e in evaluators if _resolve_evaluator(e) in _TOOL_EVALUATORS] + logger.info("Removed tool evaluators %s (no items have tools)", removed) + return filtered + + +async def _poll_eval_run( + client: AsyncOpenAI, + eval_id: str, + run_id: str, + poll_interval: float = 5.0, + timeout: float = 180.0, + provider: str = "Microsoft Foundry", + *, + fetch_output_items: bool = True, +) -> EvalResults: + """Poll an eval run until completion or timeout.""" + loop = asyncio.get_running_loop() + deadline = loop.time() + timeout + while True: + run = await client.evals.runs.retrieve(run_id=run_id, eval_id=eval_id) + if run.status in ("completed", "failed", "canceled"): + error_msg = None + if run.status == "failed": + # run.error is an EvalAPIError object (code + message) + err = run.error + if err is not None: # pyright: ignore[reportUnnecessaryComparison] + error_msg = getattr(err, "message", None) or str(err) + + items: list[EvalItemResult] = [] + if fetch_output_items and run.status == "completed": + items = await _fetch_output_items(client, eval_id, run_id) + + return EvalResults( + provider=provider, + eval_id=eval_id, + run_id=run_id, + status=run.status, + result_counts=_extract_result_counts(run), + report_url=run.report_url, + error=error_msg, + per_evaluator=_extract_per_evaluator(run), + items=items, + ) + remaining = deadline - loop.time() + if remaining <= 0: + return EvalResults(provider=provider, eval_id=eval_id, run_id=run_id, status="timeout") + logger.debug("Eval run %s status: %s (%.0fs remaining)", run_id, run.status, remaining) + await asyncio.sleep(min(poll_interval, remaining)) + + +def _extract_result_counts(run: RunRetrieveResponse) -> dict[str, int] | None: + """Extract result_counts from an eval run as a plain dict.""" + counts = getattr(run, "result_counts", None) + if counts is None: + return None + if isinstance(counts, dict): + return cast(dict[str, int], counts) + # ResultCounts is a Pydantic model with errored/failed/passed/total fields + result: dict[str, int] = {} + for attr in ("errored", "failed", "passed", "total"): + val = getattr(counts, attr, None) + if isinstance(val, int): + result[attr] = val + return result or None + + +def _extract_per_evaluator(run: RunRetrieveResponse) -> dict[str, dict[str, int]]: + """Extract per-evaluator result breakdowns from an eval run.""" + per_eval: dict[str, dict[str, int]] = {} + per_testing_criteria = getattr(run, "per_testing_criteria_results", None) + if per_testing_criteria is None: + return per_eval + # PerTestingCriteriaResult has testing_criteria (str), passed (int), failed (int) + for item in per_testing_criteria: + name = str(getattr(item, "testing_criteria", None) or getattr(item, "name", "unknown")) + passed = getattr(item, "passed", None) + failed = getattr(item, "failed", None) + if name and isinstance(passed, int) and isinstance(failed, int): + per_eval[name] = {"passed": passed, "failed": failed} + return per_eval + + +async def _fetch_output_items( + client: AsyncOpenAI, + eval_id: str, + run_id: str, +) -> list[EvalItemResult]: + """Fetch per-item results from the output_items API. + + Converts the provider-specific ``OutputItemListResponse`` objects into + provider-agnostic ``EvalItemResult`` instances with per-evaluator scores, + error categorization, and token usage. Uses async pagination to handle + eval runs with more items than a single page. 
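+
+    Malformed or unexpected response shapes are logged as warnings and any
+    items collected so far are returned rather than raising.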
+ """ + items: list[EvalItemResult] = [] + try: + output_items_page = await client.evals.runs.output_items.list( + run_id=run_id, + eval_id=eval_id, + ) + + async for oi in output_items_page: + item_id = getattr(oi, "id", "") or "" + status = getattr(oi, "status", "unknown") or "unknown" + + # Extract per-evaluator scores + scores: list[EvalScoreResult] = [] + for r in getattr(oi, "results", []) or []: + scores.append( + EvalScoreResult( + name=getattr(r, "name", "unknown"), + score=getattr(r, "score", 0.0), + passed=getattr(r, "passed", None), + sample=getattr(r, "sample", None), + ) + ) + + # Extract error info from sample + error_code: str | None = None + error_message: str | None = None + token_usage: dict[str, int] | None = None + input_text: str | None = None + output_text: str | None = None + response_id: str | None = None + + sample = getattr(oi, "sample", None) + if sample is not None: + error = getattr(sample, "error", None) + if error is not None: + code = getattr(error, "code", None) + msg = getattr(error, "message", None) + if code or msg: + error_code = code or None + error_message = msg or None + + usage = getattr(sample, "usage", None) + if usage is not None: + total = getattr(usage, "total_tokens", 0) + if total: + token_usage = { + "prompt_tokens": getattr(usage, "prompt_tokens", 0), + "completion_tokens": getattr(usage, "completion_tokens", 0), + "total_tokens": total, + "cached_tokens": getattr(usage, "cached_tokens", 0), + } + + # Extract input/output text + sample_input = getattr(sample, "input", None) + if sample_input: + parts = [getattr(si, "content", "") for si in sample_input if getattr(si, "role", "") == "user"] + if parts: + input_text = " ".join(parts) + + sample_output = getattr(sample, "output", None) + if sample_output: + parts = [ + getattr(so, "content", "") or "" + for so in sample_output + if getattr(so, "role", "") == "assistant" + ] + if parts: + output_text = " ".join(parts) + + # Extract response_id from datasource_item + ds_item = getattr(oi, "datasource_item", None) + if ds_item and isinstance(ds_item, dict): + ds_dict = cast(dict[str, Any], ds_item) + resp_id_val = ds_dict.get("resp_id") or ds_dict.get("response_id") + response_id = str(resp_id_val) if resp_id_val else None + + items.append( + EvalItemResult( + item_id=item_id, + status=status, + scores=scores, + error_code=error_code, + error_message=error_message, + response_id=response_id, + input_text=input_text, + output_text=output_text, + token_usage=token_usage, + ) + ) + except (AttributeError, KeyError, TypeError): + logger.warning("Could not fetch output_items for run %s", run_id, exc_info=True) + + return items + + +def _resolve_openai_client( + client: FoundryChatClient | AsyncOpenAI | None = None, + project_client: AIProjectClient | None = None, +) -> AsyncOpenAI: + """Resolve an AsyncOpenAI client from a FoundryChatClient, raw client, or project_client.""" + if client is not None: + if isinstance(client, FoundryChatClient): + return client.client + return client + if project_client is not None: + oai = project_client.get_openai_client() + if oai is None: # pyright: ignore[reportUnnecessaryComparison] + raise ValueError("project_client.get_openai_client() returned None. Check project configuration.") + if not isinstance(oai, AsyncOpenAI): + raise TypeError( + "project_client.get_openai_client() returned a sync client. " + "FoundryEvals requires an async AIProjectClient (from azure.ai.projects.aio)." 
+ ) + return oai + raise ValueError("Provide either 'client' or 'project_client'.") + + +async def _evaluate_via_responses_impl( + *, + client: AsyncOpenAI, + response_ids: Sequence[str], + evaluators: list[str], + model: str, + eval_name: str, + poll_interval: float, + timeout: float, + provider: str = "foundry", +) -> EvalResults: + """Evaluate using Foundry's Responses API retrieval path. + + Module-level helper used by both ``FoundryEvals`` and ``evaluate_traces``. + """ + eval_obj = await client.evals.create( + name=eval_name, + data_source_config={"type": "azure_ai_source", "scenario": "responses"}, # type: ignore[arg-type] # pyright: ignore[reportArgumentType] + testing_criteria=_build_testing_criteria(evaluators, model), # type: ignore[arg-type] # pyright: ignore[reportArgumentType] + ) + + data_source = { + "type": "azure_ai_responses", + "item_generation_params": { + "type": "response_retrieval", + "data_mapping": {"response_id": "{{item.resp_id}}"}, + "source": { + "type": "file_content", + "content": [{"item": {"resp_id": rid}} for rid in response_ids], + }, + }, + } + + run = await client.evals.runs.create( + eval_id=eval_obj.id, + name=f"{eval_name} Run", + data_source=data_source, # type: ignore[arg-type] # pyright: ignore[reportArgumentType] + ) + + return await _poll_eval_run(client, eval_obj.id, run.id, poll_interval, timeout, provider=provider) + + +# --------------------------------------------------------------------------- +# FoundryEvals — Evaluator implementation for Microsoft Foundry +# --------------------------------------------------------------------------- + + +class FoundryEvals: + """Evaluation provider backed by Microsoft Foundry. + + Implements the ``Evaluator`` protocol so it can be passed to the + provider-agnostic ``evaluate_agent()`` and + ``evaluate_workflow()`` functions from ``agent_framework``. + + Also provides constants for built-in evaluator names for IDE + autocomplete and typo prevention: + + .. code-block:: python + + from agent_framework.foundry import FoundryEvals + + evaluators = [FoundryEvals.RELEVANCE, FoundryEvals.TOOL_CALL_ACCURACY] + + Examples: + Basic usage: + + .. code-block:: python + + from agent_framework import evaluate_agent + from agent_framework.foundry import FoundryEvals, FoundryChatClient + + chat_client = FoundryChatClient(model="gpt-4o") + evals = FoundryEvals(client=chat_client) + results = await evaluate_agent(agent=agent, queries=queries, evaluators=evals) + + Zero-config with environment variables (``FOUNDRY_PROJECT_ENDPOINT`` + and ``FOUNDRY_MODEL``): + + .. code-block:: python + + evals = FoundryEvals() # reads env vars via FoundryChatClient + + **Evaluator selection:** + + By default, runs ``relevance``, ``coherence``, and ``task_adherence``. + Automatically adds ``tool_call_accuracy`` when items contain tool + definitions. Override with ``evaluators=``. + + Args: + client: A ``FoundryChatClient`` instance. The ``builtin.*`` + evaluators are a Foundry feature and require a Foundry endpoint. + When omitted (and *project_client* is also omitted), a + ``FoundryChatClient`` is auto-created from ``FOUNDRY_PROJECT_ENDPOINT`` + and ``FOUNDRY_MODEL`` environment variables. + project_client: An ``AIProjectClient`` instance (sync or async). + Provide this or *client*. + model: Model deployment name for the evaluator LLM judge. + Resolved from ``client.model`` when omitted. + evaluators: Evaluator names (e.g. ``["relevance", "tool_call_accuracy"]``). + When ``None`` (default), uses smart defaults based on item data. 
+ conversation_split: How to split multi-turn conversations into + query/response halves. Defaults to ``LAST_TURN``. Pass a + ``ConversationSplit`` enum value or a custom callable — see + ``ConversationSplitter``. + poll_interval: Seconds between status polls (default 5.0). + timeout: Maximum seconds to wait for completion (default 180.0). + """ + + # --------------------------------------------------------------------------- + # Built-in evaluator name constants + # --------------------------------------------------------------------------- + + # Agent behavior + INTENT_RESOLUTION: str = "intent_resolution" + TASK_ADHERENCE: str = "task_adherence" + TASK_COMPLETION: str = "task_completion" + TASK_NAVIGATION_EFFICIENCY: str = "task_navigation_efficiency" + + # Tool usage + TOOL_CALL_ACCURACY: str = "tool_call_accuracy" + TOOL_SELECTION: str = "tool_selection" + TOOL_INPUT_ACCURACY: str = "tool_input_accuracy" + TOOL_OUTPUT_UTILIZATION: str = "tool_output_utilization" + TOOL_CALL_SUCCESS: str = "tool_call_success" + + # Quality + COHERENCE: str = "coherence" + FLUENCY: str = "fluency" + RELEVANCE: str = "relevance" + GROUNDEDNESS: str = "groundedness" + RESPONSE_COMPLETENESS: str = "response_completeness" + SIMILARITY: str = "similarity" + + # Safety + VIOLENCE: str = "violence" + SEXUAL: str = "sexual" + SELF_HARM: str = "self_harm" + HATE_UNFAIRNESS: str = "hate_unfairness" + + def __init__( + self, + *, + client: FoundryChatClient | None = None, + project_client: AIProjectClient | None = None, + model: str | None = None, + evaluators: Sequence[str] | None = None, + conversation_split: ConversationSplitter = ConversationSplit.LAST_TURN, + poll_interval: float = 5.0, + timeout: float = 180.0, + ): + self.name = "Microsoft Foundry" + + # Auto-create a FoundryChatClient from env vars when no client is provided + if client is None and project_client is None: + client = FoundryChatClient(model=model or "gpt-4o") + + self._client = _resolve_openai_client(client, project_client) + # Resolve model: explicit param > client.model > error + resolved_model = model or (client.model if client is not None else None) + if not resolved_model: + raise ValueError( + "Model is required. Pass model= explicitly or use a FoundryChatClient that has a model configured." + ) + self._model = resolved_model + self._evaluators = list(evaluators) if evaluators is not None else None + self._conversation_split = conversation_split + self._poll_interval = poll_interval + self._timeout = timeout + + async def evaluate( + self, + items: Sequence[EvalItem], + *, + eval_name: str = "Agent Framework Eval", + ) -> EvalResults: + """Evaluate items using Foundry evaluators. + + Implements the ``Evaluator`` protocol. Automatically resolves default + evaluators and filters tool evaluators for items without tool definitions. + + Args: + items: Eval data items from ``AgentEvalConverter.to_eval_item()``. + eval_name: Display name for the evaluation run. + + Returns: + ``EvalResults`` with status, counts, and portal link. 
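+
+        Example:
+            A minimal sketch; ``pairs`` is a hypothetical list of
+            ``(query, AgentResponse)`` tuples collected from your own agent
+            runs, and ``evals`` is an already-configured ``FoundryEvals``:
+
+            .. code-block:: python
+
+                from agent_framework import AgentEvalConverter
+
+                # Convert each (query, response) pair into an EvalItem, then
+                # run the configured Foundry evaluators and fail on regressions.
+                items = [AgentEvalConverter.to_eval_item(query=q, response=r) for q, r in pairs]
+                results = await evals.evaluate(items, eval_name="Nightly agent eval")
+                results.raise_for_status()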
+ """ + # Resolve evaluators with auto-detection + resolved = _resolve_default_evaluators(self._evaluators, items=items) + # Filter tool evaluators if items don't have tools + resolved = _filter_tool_evaluators(resolved, items) + + # Standard JSONL dataset path + return await self._evaluate_via_dataset(items, resolved, eval_name) + + # -- Internal evaluation paths -- + + async def _evaluate_via_dataset( + self, + items: Sequence[EvalItem], + evaluators: list[str], + eval_name: str, + ) -> EvalResults: + """Evaluate using JSONL dataset upload path.""" + dicts: list[dict[str, Any]] = [] + for item in items: + # Build JSONL dict directly from split_messages + converter + # to avoid splitting the conversation twice. + effective_split = item.split_strategy or self._conversation_split + query_msgs, response_msgs = item.split_messages(effective_split) + + query_text = " ".join(m.text for m in query_msgs if m.role == "user" and m.text).strip() + response_text = " ".join(m.text for m in response_msgs if m.role == "assistant" and m.text).strip() + + d: dict[str, Any] = { + "query": query_text, + "response": response_text, + "query_messages": AgentEvalConverter.convert_messages(query_msgs), + "response_messages": AgentEvalConverter.convert_messages(response_msgs), + } + if item.tools: + d["tool_definitions"] = [ + {"name": t.name, "description": t.description, "parameters": t.parameters()} for t in item.tools + ] + if item.context: + d["context"] = item.context + dicts.append(d) + + has_context = any("context" in d for d in dicts) + has_tools = any("tool_definitions" in d for d in dicts) + + eval_obj = await self._client.evals.create( + name=eval_name, + data_source_config={ # type: ignore[arg-type] # pyright: ignore[reportArgumentType] + "type": "custom", + "item_schema": _build_item_schema(has_context=has_context, has_tools=has_tools), + "include_sample_schema": True, + }, + testing_criteria=_build_testing_criteria( # type: ignore[arg-type] # pyright: ignore[reportArgumentType] + evaluators, + self._model, + include_data_mapping=True, + ), + ) + + data_source = { + "type": "jsonl", + "source": { + "type": "file_content", + "content": [{"item": d} for d in dicts], + }, + } + + run = await self._client.evals.runs.create( + eval_id=eval_obj.id, + name=f"{eval_name} Run", + data_source=data_source, # type: ignore[arg-type] # pyright: ignore[reportArgumentType] + ) + + return await _poll_eval_run( + self._client, + eval_obj.id, + run.id, + self._poll_interval, + self._timeout, + provider=self.name, + ) + + +# --------------------------------------------------------------------------- +# Foundry-specific functions (not part of the Evaluator protocol) +# --------------------------------------------------------------------------- + + +async def evaluate_traces( + *, + evaluators: Sequence[str] | None = None, + client: FoundryChatClient | None = None, + project_client: AIProjectClient | None = None, + model: str, + response_ids: Sequence[str] | None = None, + trace_ids: Sequence[str] | None = None, + agent_id: str | None = None, + lookback_hours: int = 24, + eval_name: str = "Agent Framework Trace Eval", + poll_interval: float = 5.0, + timeout: float = 180.0, +) -> EvalResults: + """Evaluate agent behavior from OTel traces or response IDs. + + Foundry-specific function — works with any agent that emits OTel traces + to App Insights. Provide *response_ids* for specific responses, + *trace_ids* for specific traces, or *agent_id* with *lookback_hours* + to evaluate recent activity. 
+ + Args: + evaluators: Evaluator names (e.g. ``[FoundryEvals.RELEVANCE]``). + Defaults to relevance, coherence, and task_adherence. + client: A ``FoundryChatClient`` instance. Provide this or *project_client*. + project_client: An ``AIProjectClient`` instance. + model: Model deployment name for the evaluator LLM judge. + response_ids: Evaluate specific Responses API responses. + trace_ids: Evaluate specific OTel trace IDs from App Insights. + agent_id: Filter traces by agent ID (used with *lookback_hours*). + lookback_hours: Hours of trace history to evaluate (default 24). + eval_name: Display name for the evaluation. + poll_interval: Seconds between status polls. + timeout: Maximum seconds to wait for completion. + + Returns: + ``EvalResults`` with status, result counts, and portal link. + + Example: + + .. code-block:: python + + results = await evaluate_traces( + response_ids=[response.response_id], + evaluators=[FoundryEvals.RELEVANCE], + client=chat_client, + model="gpt-4o", + ) + """ + oai_client = _resolve_openai_client(client, project_client) + resolved_evaluators = _resolve_default_evaluators(evaluators) + + if response_ids: + return await _evaluate_via_responses_impl( + client=oai_client, + response_ids=response_ids, + evaluators=resolved_evaluators, + model=model, + eval_name=eval_name, + poll_interval=poll_interval, + timeout=timeout, + ) + + if not trace_ids and not agent_id: + raise ValueError("Provide at least one of: response_ids, trace_ids, or agent_id") + + trace_source: dict[str, Any] = { + "type": "azure_ai_traces", + "lookback_hours": lookback_hours, + } + if trace_ids: + trace_source["trace_ids"] = list(trace_ids) + if agent_id: + trace_source["agent_id"] = agent_id + + eval_obj = await oai_client.evals.create( + name=eval_name, + data_source_config={"type": "azure_ai_source", "scenario": "traces"}, # type: ignore[arg-type] # pyright: ignore[reportArgumentType] + testing_criteria=_build_testing_criteria(resolved_evaluators, model), # type: ignore[arg-type] # pyright: ignore[reportArgumentType] + ) + + run = await oai_client.evals.runs.create( + eval_id=eval_obj.id, + name=f"{eval_name} Run", + data_source=trace_source, # type: ignore[arg-type] # pyright: ignore[reportArgumentType] + ) + + return await _poll_eval_run(oai_client, eval_obj.id, run.id, poll_interval, timeout) + + +async def evaluate_foundry_target( + *, + target: dict[str, Any], + test_queries: Sequence[str], + evaluators: Sequence[str] | None = None, + client: FoundryChatClient | None = None, + project_client: AIProjectClient | None = None, + model: str, + eval_name: str = "Agent Framework Target Eval", + poll_interval: float = 5.0, + timeout: float = 180.0, +) -> EvalResults: + """Evaluate a Foundry-registered agent or model deployment. + + Foundry invokes the target, captures the output, and evaluates it. Use + this for scheduled evals, red teaming, and CI/CD quality gates. + + Args: + target: Target configuration dict. + test_queries: Queries for Foundry to send to the target. + evaluators: Evaluator names. + client: A ``FoundryChatClient`` instance. Provide this or *project_client*. + project_client: An ``AIProjectClient`` instance. + model: Model deployment name for the evaluator LLM judge. + eval_name: Display name for the evaluation. + poll_interval: Seconds between status polls. + timeout: Maximum seconds to wait for completion. + + Returns: + ``EvalResults`` with status, result counts, and portal link. + + Example: + + .. 
code-block:: python + + results = await evaluate_foundry_target( + target={"type": "azure_ai_agent", "name": "my-agent"}, + test_queries=["Book a flight to Paris"], + client=chat_client, + model="gpt-4o", + ) + """ + if "type" not in target: + raise ValueError("target dict must include a 'type' key (e.g., 'azure_ai_agent').") + oai_client = _resolve_openai_client(client, project_client) + resolved_evaluators = _resolve_default_evaluators(evaluators) + + eval_obj = await oai_client.evals.create( + name=eval_name, + data_source_config={ # type: ignore[arg-type] # pyright: ignore[reportArgumentType] + "type": "azure_ai_source", + "scenario": "target_completions", + }, + testing_criteria=_build_testing_criteria(resolved_evaluators, model), # type: ignore[arg-type] # pyright: ignore[reportArgumentType] + ) + + data_source: dict[str, Any] = { + "type": "azure_ai_target_completions", + "target": target, + "source": { + "type": "file_content", + "content": [{"item": {"query": q}} for q in test_queries], + }, + } + + run = await oai_client.evals.runs.create( + eval_id=eval_obj.id, + name=f"{eval_name} Run", + data_source=data_source, # type: ignore[arg-type] # pyright: ignore[reportArgumentType] + ) + + return await _poll_eval_run(oai_client, eval_obj.id, run.id, poll_interval, timeout) diff --git a/python/packages/foundry/tests/test_foundry_evals.py b/python/packages/foundry/tests/test_foundry_evals.py new file mode 100644 index 0000000000..03e21b5cdf --- /dev/null +++ b/python/packages/foundry/tests/test_foundry_evals.py @@ -0,0 +1,2475 @@ +# Copyright (c) Microsoft. All rights reserved. + +"""Tests for the AgentEvalConverter, FoundryEvals, and eval helper functions.""" + +from __future__ import annotations + +import json +from dataclasses import dataclass +from typing import Any +from unittest.mock import AsyncMock, MagicMock + +import pytest +from agent_framework import AgentExecutorResponse, AgentResponse, Content, FunctionTool, Message, WorkflowEvent +from agent_framework._evaluation import ( + AgentEvalConverter, + ConversationSplit, + EvalItem, + EvalNotPassedError, + EvalResults, + _extract_agent_eval_data, + _extract_overall_query, + evaluate_agent, + evaluate_workflow, +) +from agent_framework._workflows._workflow import WorkflowRunResult +from openai import AsyncOpenAI + +from agent_framework_foundry._foundry_evals import ( + FoundryEvals, + _build_item_schema, + _build_testing_criteria, + _extract_per_evaluator, + _extract_result_counts, + _filter_tool_evaluators, + _resolve_default_evaluators, + _resolve_evaluator, + _resolve_openai_client, +) + + +class _AsyncPage: + """Async-iterable mock for OpenAI SDK pagination pages.""" + + def __init__(self, items: list[Any]) -> None: + self._items = items + + def __aiter__(self) -> _AsyncPage: + self._iter = iter(self._items) + return self + + async def __anext__(self) -> Any: + try: + return next(self._iter) + except StopIteration: + raise StopAsyncIteration from None + + +def _make_tool(name: str) -> MagicMock: + """Create a mock FunctionTool for use in tests.""" + t = MagicMock() + t.name = name + t.description = f"{name} tool" + t.parameters = MagicMock(return_value={"type": "object"}) + return t + + +# --------------------------------------------------------------------------- +# _resolve_evaluator +# --------------------------------------------------------------------------- + + +class TestResolveEvaluator: + def test_short_name(self) -> None: + assert _resolve_evaluator("relevance") == "builtin.relevance" + assert 
_resolve_evaluator("tool_call_accuracy") == "builtin.tool_call_accuracy" + assert _resolve_evaluator("violence") == "builtin.violence" + + def test_already_qualified(self) -> None: + assert _resolve_evaluator("builtin.relevance") == "builtin.relevance" + assert _resolve_evaluator("builtin.custom") == "builtin.custom" + + def test_unknown_raises(self) -> None: + with pytest.raises(ValueError, match="Unknown evaluator 'bogus'"): + _resolve_evaluator("bogus") + + +# --------------------------------------------------------------------------- +# AgentEvalConverter.convert_message +# --------------------------------------------------------------------------- + + +class TestConvertMessage: + def test_user_text_message(self) -> None: + msg = Message("user", ["Hello, world!"]) + result = AgentEvalConverter.convert_message(msg) + assert len(result) == 1 + assert result[0] == {"role": "user", "content": [{"type": "text", "text": "Hello, world!"}]} + + def test_system_message(self) -> None: + msg = Message("system", ["You are helpful."]) + result = AgentEvalConverter.convert_message(msg) + assert result[0] == {"role": "system", "content": [{"type": "text", "text": "You are helpful."}]} + + def test_assistant_text_message(self) -> None: + msg = Message("assistant", ["Here is the answer."]) + result = AgentEvalConverter.convert_message(msg) + assert len(result) == 1 + assert result[0]["role"] == "assistant" + assert result[0]["content"] == [{"type": "text", "text": "Here is the answer."}] + assert len(result[0]["content"]) == 1 + + def test_assistant_with_tool_call(self) -> None: + msg = Message( + "assistant", + [ + Content.from_function_call( + call_id="call_1", + name="get_weather", + arguments=json.dumps({"location": "Seattle"}), + ), + ], + ) + result = AgentEvalConverter.convert_message(msg) + assert len(result) == 1 + assert result[0]["role"] == "assistant" + tc = result[0]["content"][0] + assert tc["type"] == "tool_call" + assert tc["tool_call_id"] == "call_1" + assert tc["name"] == "get_weather" + assert tc["arguments"] == {"location": "Seattle"} + + def test_assistant_text_and_tool_call(self) -> None: + msg = Message( + "assistant", + [ + Content.from_text("Let me check that."), + Content.from_function_call( + call_id="call_2", + name="search", + arguments={"query": "flights"}, + ), + ], + ) + result = AgentEvalConverter.convert_message(msg) + assert len(result) == 1 + assert result[0]["content"][0] == {"type": "text", "text": "Let me check that."} + tc = result[0]["content"][1] + assert tc["type"] == "tool_call" + assert tc["arguments"] == {"query": "flights"} + + def test_tool_result_message(self) -> None: + msg = Message( + "tool", + [ + Content.from_function_result( + call_id="call_1", + result="72°F, sunny", + ), + ], + ) + result = AgentEvalConverter.convert_message(msg) + assert len(result) == 1 + assert result[0]["role"] == "tool" + assert result[0]["tool_call_id"] == "call_1" + assert result[0]["content"] == [{"type": "tool_result", "tool_result": "72°F, sunny"}] + + def test_multiple_tool_results(self) -> None: + msg = Message( + "tool", + [ + Content.from_function_result(call_id="call_1", result="r1"), + Content.from_function_result(call_id="call_2", result="r2"), + ], + ) + result = AgentEvalConverter.convert_message(msg) + assert len(result) == 2 + assert result[0]["tool_call_id"] == "call_1" + assert result[1]["tool_call_id"] == "call_2" + + def test_non_string_result_kept_as_object(self) -> None: + msg = Message( + "tool", + [ + Content.from_function_result( + call_id="call_1", 
+ result={"temp": 72, "unit": "F"}, + ), + ], + ) + result = AgentEvalConverter.convert_message(msg) + tr = result[0]["content"][0] + assert tr["type"] == "tool_result" + assert tr["tool_result"] == {"temp": 72, "unit": "F"} + + def test_empty_message(self) -> None: + msg = Message("user", []) + result = AgentEvalConverter.convert_message(msg) + assert result[0] == {"role": "user", "content": [{"type": "text", "text": ""}]} + + +# --------------------------------------------------------------------------- +# AgentEvalConverter.convert_messages +# --------------------------------------------------------------------------- + + +class TestConvertMessages: + def test_full_conversation(self) -> None: + messages = [ + Message("user", ["What's the weather?"]), + Message( + "assistant", + [Content.from_function_call(call_id="c1", name="get_weather", arguments='{"loc": "SEA"}')], + ), + Message("tool", [Content.from_function_result(call_id="c1", result="Sunny")]), + Message("assistant", ["It's sunny in Seattle!"]), + ] + result = AgentEvalConverter.convert_messages(messages) + assert len(result) == 4 + assert result[0]["role"] == "user" + assert result[1]["role"] == "assistant" + assert result[1]["content"][0]["type"] == "tool_call" + assert result[1]["content"][0]["name"] == "get_weather" + assert result[2]["role"] == "tool" + assert result[2]["content"][0]["type"] == "tool_result" + assert result[3]["role"] == "assistant" + assert result[3]["content"] == [{"type": "text", "text": "It's sunny in Seattle!"}] + + +# --------------------------------------------------------------------------- +# AgentEvalConverter.extract_tools +# --------------------------------------------------------------------------- + + +class TestExtractTools: + def test_extracts_function_tools(self) -> None: + tool = FunctionTool( + name="get_weather", + description="Get weather for a location", + func=lambda location: f"Sunny in {location}", + ) + agent = MagicMock() + agent.default_options = {"tools": [tool]} + + result = AgentEvalConverter.extract_tools(agent) + assert len(result) == 1 + assert result[0]["name"] == "get_weather" + assert result[0]["description"] == "Get weather for a location" + assert "parameters" in result[0] + + def test_skips_non_function_tools(self) -> None: + agent = MagicMock() + agent.default_options = {"tools": [{"type": "web_search"}, "some_string"]} + + result = AgentEvalConverter.extract_tools(agent) + assert len(result) == 0 + + def test_no_tools(self) -> None: + agent = MagicMock() + agent.default_options = {} + assert AgentEvalConverter.extract_tools(agent) == [] + + def test_no_default_options(self) -> None: + agent = MagicMock(spec=[]) # No attributes + assert AgentEvalConverter.extract_tools(agent) == [] + + +# --------------------------------------------------------------------------- +# AgentEvalConverter.to_eval_item (now returns EvalItem) +# --------------------------------------------------------------------------- + + +class TestToEvalItem: + def test_string_query(self) -> None: + response = AgentResponse(messages=[Message("assistant", ["The weather is sunny."])]) + item = AgentEvalConverter.to_eval_item(query="What's the weather?", response=response) + + assert isinstance(item, EvalItem) + assert item.query == "What's the weather?" + assert item.response == "The weather is sunny." 
+ assert len(item.conversation) == 2 + assert item.conversation[0].role == "user" + assert item.conversation[1].role == "assistant" + + def test_message_query(self) -> None: + input_msgs = [ + Message("system", ["Be helpful."]), + Message("user", ["Hello"]), + ] + response = AgentResponse(messages=[Message("assistant", ["Hi there!"])]) + item = AgentEvalConverter.to_eval_item(query=input_msgs, response=response) + + assert item.query == "Hello" # Only user messages + assert len(item.conversation) == 3 # system + user + assistant + + def test_with_context(self) -> None: + response = AgentResponse(messages=[Message("assistant", ["Answer."])]) + item = AgentEvalConverter.to_eval_item( + query="Question?", + response=response, + context="Some reference document.", + ) + assert item.context == "Some reference document." + + def test_with_explicit_tools(self) -> None: + tool = FunctionTool( + name="search", + description="Search the web", + func=lambda q: f"Results for {q}", + ) + response = AgentResponse(messages=[Message("assistant", ["Found it."])]) + item = AgentEvalConverter.to_eval_item( + query="Find info", + response=response, + tools=[tool], + ) + assert item.tools is not None + assert len(item.tools) == 1 + assert item.tools[0].name == "search" + + def test_with_agent_tools(self) -> None: + tool = FunctionTool(name="calc", description="Calculate", func=lambda x: str(x)) + agent = MagicMock() + agent.default_options = {"tools": [tool]} + + response = AgentResponse(messages=[Message("assistant", ["42"])]) + item = AgentEvalConverter.to_eval_item( + query="What is 6*7?", + response=response, + agent=agent, + ) + assert item.tools is not None + assert item.tools[0].name == "calc" + + def test_explicit_tools_override_agent(self) -> None: + agent_tool = FunctionTool(name="agent_tool", description="from agent", func=lambda: "") + explicit_tool = FunctionTool(name="explicit_tool", description="explicit", func=lambda: "") + + agent = MagicMock() + agent.default_options = {"tools": [agent_tool]} + + response = AgentResponse(messages=[Message("assistant", ["Done"])]) + item = AgentEvalConverter.to_eval_item( + query="Test", + response=response, + agent=agent, + tools=[explicit_tool], + ) + assert item.tools is not None + assert len(item.tools) == 1 + assert item.tools[0].name == "explicit_tool" + + def test_split_messages_format(self) -> None: + """split_messages() should split conversation at last user message.""" + response = AgentResponse(messages=[Message("assistant", ["Answer"])]) + item = AgentEvalConverter.to_eval_item( + query="Q", + response=response, + tools=[FunctionTool(name="t", description="d", func=lambda: "")], + ) + query_msgs, response_msgs = item.split_messages() + # Single-turn: query has just the user msg, response has the assistant msg + assert len(query_msgs) == 1 + assert query_msgs[0].role == "user" + assert len(response_msgs) == 1 + assert response_msgs[0].role == "assistant" + # Tools preserved on item + assert item.tools is not None + assert len(item.tools) == 1 + assert item.tools[0].name == "t" + + def test_split_messages_multiturn_preserves_interleaving(self) -> None: + """Multi-turn split_messages() splits at last user message, preserving interleaving.""" + conversation = [ + Message("user", ["What's the weather?"]), + Message("assistant", ["It's sunny in Seattle."]), + Message("user", ["And tomorrow?"]), + Message("assistant", [Content(type="function_call", name="get_forecast")]), + Message("tool", [Content(type="function_result", result="Rain expected")]), + 
Message("assistant", ["Rain is expected tomorrow."]), + ] + item = EvalItem(conversation=conversation) + query_msgs, response_msgs = item.split_messages() + # query_messages: everything up to and including the last user message + assert len(query_msgs) == 3 # user, assistant, user + assert query_msgs[0].role == "user" + assert query_msgs[1].role == "assistant" # interleaved! + assert query_msgs[2].role == "user" + # response_messages: everything after the last user message + assert len(response_msgs) == 3 # assistant(tool_call), tool, assistant + assert response_msgs[0].role == "assistant" + assert response_msgs[1].role == "tool" + assert response_msgs[2].role == "assistant" + + def test_split_messages_full_split(self) -> None: + """ConversationSplit.FULL splits after the first user message.""" + conversation = [ + Message("user", ["What's the weather?"]), + Message("assistant", ["It's 62°F in Seattle."]), + Message("user", ["And tomorrow?"]), + Message("assistant", ["Rain is expected tomorrow."]), + ] + item = EvalItem(conversation=conversation) + query_msgs, response_msgs = item.split_messages(split=ConversationSplit.FULL) + # query_messages: just the first user message + assert len(query_msgs) == 1 + assert query_msgs[0].role == "user" + assert query_msgs[0].text == "What's the weather?" + # response_messages: everything after the first user message + assert len(response_msgs) == 3 + assert response_msgs[0].role == "assistant" + assert response_msgs[1].role == "user" + assert response_msgs[2].role == "assistant" + + def test_split_messages_full_split_with_system(self) -> None: + """FULL split includes system messages before the first user message in query.""" + conversation = [ + Message("system", ["You are a weather assistant."]), + Message("user", ["What's the weather?"]), + Message("assistant", ["It's sunny."]), + ] + item = EvalItem(conversation=conversation) + query_msgs, response_msgs = item.split_messages(split=ConversationSplit.FULL) + # query includes system + first user + assert len(query_msgs) == 2 + assert query_msgs[0].role == "system" + assert query_msgs[1].role == "user" + assert len(response_msgs) == 1 + + def test_split_messages_full_split_with_tools(self) -> None: + """FULL split puts all tool interactions in response_messages.""" + conversation = [ + Message("user", ["What's the weather?"]), + Message("assistant", [Content(type="function_call", name="get_weather")]), + Message("tool", [Content(type="function_result", result="62°F")]), + Message("assistant", ["It's 62°F."]), + Message("user", ["Thanks!"]), + Message("assistant", ["You're welcome!"]), + ] + item = EvalItem(conversation=conversation) + query_msgs, response_msgs = item.split_messages(split=ConversationSplit.FULL) + assert len(query_msgs) == 1 + assert len(response_msgs) == 5 + + def test_split_messages_last_turn_is_default(self) -> None: + """Default split_messages() uses LAST_TURN split.""" + conversation = [ + Message("user", ["Hello"]), + Message("assistant", ["Hi there"]), + Message("user", ["Bye"]), + Message("assistant", ["Goodbye"]), + ] + item = EvalItem(conversation=conversation) + q_default, r_default = item.split_messages() + q_explicit, r_explicit = item.split_messages(split=ConversationSplit.LAST_TURN) + assert [m.role for m in q_default] == [m.role for m in q_explicit] + assert [m.text for m in q_default] == [m.text for m in q_explicit] + assert [m.role for m in r_default] == [m.role for m in r_explicit] + assert [m.text for m in r_default] == [m.text for m in r_explicit] + + def 
test_per_turn_items_simple(self) -> None: + """per_turn_items produces one EvalItem per user message.""" + conversation = [ + Message("user", ["What's the weather?"]), + Message("assistant", ["It's 62°F."]), + Message("user", ["And tomorrow?"]), + Message("assistant", ["Rain expected."]), + ] + items = EvalItem.per_turn_items(conversation) + assert len(items) == 2 + + # Turn 1 + assert items[0].query == "What's the weather?" + assert items[0].response == "It's 62°F." + assert len(items[0].conversation) == 2 + + # Turn 2 — includes cumulative context; query joins all user texts in query split + assert items[1].query == "What's the weather? And tomorrow?" + assert items[1].response == "Rain expected." + assert len(items[1].conversation) == 4 + + def test_per_turn_items_with_tools(self) -> None: + """per_turn_items handles tool calls within a turn.""" + conversation = [ + Message("user", ["Check weather"]), + Message("assistant", [Content(type="function_call", name="get_weather")]), + Message("tool", [Content(type="function_result", result="sunny")]), + Message("assistant", ["It's sunny."]), + Message("user", ["Thanks"]), + Message("assistant", ["You're welcome!"]), + ] + tool_objs = [_make_tool("get_weather")] + items = EvalItem.per_turn_items(conversation, tools=tool_objs) + assert len(items) == 2 + + # Turn 1: response includes tool_call, tool_result, and final assistant + assert items[0].response == "It's sunny." + assert items[0].tools == tool_objs + assert len(items[0].conversation) == 4 # user, assistant(tool), tool, assistant + + # Turn 2 + assert items[1].response == "You're welcome!" + assert len(items[1].conversation) == 6 # full conversation + + def test_per_turn_items_empty(self) -> None: + """per_turn_items returns empty list when no user messages.""" + items = EvalItem.per_turn_items([Message("assistant", ["Hello"])]) + assert items == [] + + def test_per_turn_items_single_turn(self) -> None: + """per_turn_items with single turn produces one item.""" + conversation = [ + Message("user", ["Hi"]), + Message("assistant", ["Hello!"]), + ] + items = EvalItem.per_turn_items(conversation) + assert len(items) == 1 + assert items[0].query == "Hi" + assert items[0].response == "Hello!" 
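+
+    def test_per_turn_items_compose_with_split_messages(self) -> None:
+        """Illustrative sketch (no new code path): items from per_turn_items can
+        still be split with split_messages(); with the default LAST_TURN split,
+        each turn's own user message is the last query message."""
+        conversation = [
+            Message("user", ["First question"]),
+            Message("assistant", ["First answer"]),
+            Message("user", ["Second question"]),
+            Message("assistant", ["Second answer"]),
+        ]
+        items = EvalItem.per_turn_items(conversation)
+        assert len(items) == 2
+        for item in items:
+            query_msgs, response_msgs = item.split_messages()
+            # The turn's user message closes the query half ...
+            assert query_msgs[-1].role == "user"
+            # ... and the assistant reply for that turn is the response half.
+            assert len(response_msgs) == 1
+            assert response_msgs[-1].role == "assistant"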
+ + def test_custom_splitter_callable(self) -> None: + """Custom callable splitter is used by split_messages().""" + conversation = [ + Message("user", ["Remember my name is Alice"]), + Message("assistant", ["Got it, Alice!"]), + Message("user", ["What's the capital of France?"]), + Message("assistant", [Content(type="function_call", name="retrieve_memory", call_id="m1")]), + Message("tool", [Content(type="function_result", call_id="m1", result="User name: Alice")]), + Message("assistant", ["The capital of France is Paris, Alice!"]), + ] + + def split_before_memory(conv): + """Split just before the memory retrieval tool call.""" + for i, msg in enumerate(conv): + for c in msg.contents: + if c.name == "retrieve_memory": + return conv[:i], conv[i:] + return EvalItem._split_last_turn_static(conv) + + item = EvalItem(conversation=conversation) + query_msgs, response_msgs = item.split_messages(split=split_before_memory) + + # split_before_memory finds "retrieve_memory" at conv[3] (assistant tool_call msg) + # query = conv[:3] = [user, assistant, user] + # response = conv[3:] = [assistant(tool_call), tool, assistant] + assert len(query_msgs) == 3 + assert query_msgs[-1].role == "user" + assert len(response_msgs) == 3 + assert response_msgs[0].role == "assistant" # the tool_call msg + + def test_custom_splitter_with_fallback(self) -> None: + """Custom splitter falls back to _split_last_turn_static when pattern not found.""" + conversation = [ + Message("user", ["Hello"]), + Message("assistant", ["Hi there!"]), + ] + + def split_before_memory(conv): + for i, msg in enumerate(conv): + for c in msg.contents: + if c.name == "retrieve_memory": + return conv[:i], conv[i:] + return EvalItem._split_last_turn_static(conv) + + item = EvalItem(conversation=conversation) + query_msgs, response_msgs = item.split_messages(split=split_before_memory) + # Falls back to last-turn split + assert len(query_msgs) == 1 + assert query_msgs[0].role == "user" + assert len(response_msgs) == 1 + assert response_msgs[0].role == "assistant" + + def test_custom_splitter_lambda(self) -> None: + """A lambda works as a custom splitter.""" + conversation = [ + Message("user", ["A"]), + Message("assistant", ["B"]), + Message("user", ["C"]), + Message("assistant", ["D"]), + ] + # Split at index 2 (arbitrary) + item = EvalItem(conversation=conversation) + query_msgs, response_msgs = item.split_messages(split=lambda conv: (conv[:2], conv[2:])) + assert len(query_msgs) == 2 + assert len(response_msgs) == 2 + + def test_split_strategy_on_item_used_by_split_messages(self) -> None: + """split_strategy field on EvalItem is used as default by split_messages().""" + conversation = [ + Message("user", ["First"]), + Message("assistant", ["Response 1"]), + Message("user", ["Second"]), + Message("assistant", ["Response 2"]), + ] + item = EvalItem( + conversation=conversation, + split_strategy=ConversationSplit.FULL, + ) + # split_messages() with no split arg should use item.split_strategy + query_msgs, response_msgs = item.split_messages() + assert len(query_msgs) == 1 # FULL: just first user msg + assert query_msgs[0].text == "First" + assert len(response_msgs) == 3 + + def test_explicit_split_overrides_item_split_strategy(self) -> None: + """Explicit split= arg to split_messages() overrides item.split_strategy.""" + conversation = [ + Message("user", ["First"]), + Message("assistant", ["Response 1"]), + Message("user", ["Second"]), + Message("assistant", ["Response 2"]), + ] + item = EvalItem( + conversation=conversation, + 
split_strategy=ConversationSplit.FULL, + ) + # Explicit split= should override split_strategy + query_msgs, response_msgs = item.split_messages(split=ConversationSplit.LAST_TURN) + assert len(query_msgs) == 3 # LAST_TURN: up to last user + assert query_msgs[-1].text == "Second" + assert len(response_msgs) == 1 + + def test_no_split_defaults_to_last_turn(self) -> None: + """When neither split= nor split_strategy is set, defaults to LAST_TURN.""" + conversation = [ + Message("user", ["Hello"]), + Message("assistant", ["Hi"]), + ] + item = EvalItem(conversation=conversation) + assert item.split_strategy is None + query_msgs, response_msgs = item.split_messages() + assert len(query_msgs) == 1 + assert query_msgs[0].role == "user" + + +# --------------------------------------------------------------------------- +# _build_testing_criteria +# --------------------------------------------------------------------------- + + +class TestBuildTestingCriteria: + def test_without_data_mapping(self) -> None: + criteria = _build_testing_criteria(["relevance", "coherence"], "gpt-4o") + assert len(criteria) == 2 + assert criteria[0]["evaluator_name"] == "builtin.relevance" + assert criteria[0]["initialization_parameters"] == {"deployment_name": "gpt-4o"} + assert "data_mapping" not in criteria[0] + + def test_with_data_mapping(self) -> None: + criteria = _build_testing_criteria(["relevance", "groundedness"], "gpt-4o", include_data_mapping=True) + assert "data_mapping" in criteria[0] + # Quality evaluators should NOT have conversation + assert criteria[0]["data_mapping"] == { + "query": "{{item.query}}", + "response": "{{item.response}}", + } + # Groundedness has an extra context mapping + assert "context" in criteria[1]["data_mapping"] + assert "conversation" not in criteria[1]["data_mapping"] + + def test_tool_evaluator_includes_tool_definitions(self) -> None: + criteria = _build_testing_criteria(["relevance", "tool_call_accuracy"], "gpt-4o", include_data_mapping=True) + # relevance: string query/response + assert criteria[0]["data_mapping"]["query"] == "{{item.query}}" + assert criteria[0]["data_mapping"]["response"] == "{{item.response}}" + assert "tool_definitions" not in criteria[0]["data_mapping"] + # tool_call_accuracy: array query/response + tool_definitions + assert criteria[1]["data_mapping"]["query"] == "{{item.query_messages}}" + assert criteria[1]["data_mapping"]["response"] == "{{item.response_messages}}" + assert criteria[1]["data_mapping"]["tool_definitions"] == "{{item.tool_definitions}}" + + def test_agent_evaluators_use_message_arrays(self) -> None: + agent_evals = ["task_adherence", "intent_resolution", "task_completion"] + criteria = _build_testing_criteria(agent_evals, "gpt-4o", include_data_mapping=True) + for c in criteria: + assert c["data_mapping"]["query"] == "{{item.query_messages}}", f"{c['name']}" + assert c["data_mapping"]["response"] == "{{item.response_messages}}", f"{c['name']}" + + def test_quality_evaluators_use_strings(self) -> None: + quality_evals = ["coherence", "relevance", "fluency"] + criteria = _build_testing_criteria(quality_evals, "gpt-4o", include_data_mapping=True) + for c in criteria: + assert c["data_mapping"]["query"] == "{{item.query}}", f"{c['name']}" + assert c["data_mapping"]["response"] == "{{item.response}}", f"{c['name']}" + + def test_all_tool_evaluators_include_tool_definitions(self) -> None: + tool_evals = [ + "tool_call_accuracy", + "tool_selection", + "tool_input_accuracy", + "tool_output_utilization", + "tool_call_success", + ] + criteria = 
_build_testing_criteria(tool_evals, "gpt-4o", include_data_mapping=True) + for c in criteria: + assert "tool_definitions" in c["data_mapping"], f"{c['name']} missing tool_definitions" + + +# --------------------------------------------------------------------------- +# _build_item_schema +# --------------------------------------------------------------------------- + + +class TestBuildItemSchema: + def test_without_context(self) -> None: + schema = _build_item_schema(has_context=False) + assert "context" not in schema["properties"] + assert schema["required"] == ["query", "response"] + + def test_with_context(self) -> None: + schema = _build_item_schema(has_context=True) + assert "context" in schema["properties"] + + def test_with_tools(self) -> None: + schema = _build_item_schema(has_tools=True) + assert "tool_definitions" in schema["properties"] + + def test_with_context_and_tools(self) -> None: + schema = _build_item_schema(has_context=True, has_tools=True) + assert "context" in schema["properties"] + assert "tool_definitions" in schema["properties"] + + +# --------------------------------------------------------------------------- +# FoundryEvals (constructor, name, select, evaluate via dataset) +# --------------------------------------------------------------------------- + + +class TestFoundryEvals: + def test_constructor_with_openai_client(self) -> None: + mock_client = MagicMock() + fe = FoundryEvals(client=mock_client, model="gpt-4o") + assert fe.name == "Microsoft Foundry" + + def test_constructor_with_project_client(self) -> None: + mock_oai = MagicMock(spec=AsyncOpenAI) + mock_project = MagicMock() + mock_project.get_openai_client.return_value = mock_oai + fe = FoundryEvals(project_client=mock_project, model="gpt-4o") + assert fe.name == "Microsoft Foundry" + mock_project.get_openai_client.assert_called_once() + + def test_constructor_no_client_auto_creates_from_env(self) -> None: + """When no client/project_client given, auto-creates FoundryChatClient from env.""" + import os + from unittest.mock import patch + + with patch.dict(os.environ, {}, clear=True), pytest.raises((ValueError, Exception)): + FoundryEvals(model="gpt-4o") + + def test_name_property(self) -> None: + fe = FoundryEvals(client=MagicMock(), model="gpt-4o") + assert fe.name == "Microsoft Foundry" + + def test_evaluators_passed_in_constructor(self) -> None: + fe = FoundryEvals( + client=MagicMock(), + model="gpt-4o", + evaluators=["relevance", "coherence"], + ) + assert fe._evaluators == ["relevance", "coherence"] + + async def test_evaluate_calls_evals_api(self) -> None: + mock_client = MagicMock() + + mock_eval = MagicMock() + mock_eval.id = "eval_123" + mock_client.evals.create = AsyncMock(return_value=mock_eval) + + mock_run = MagicMock() + mock_run.id = "run_456" + mock_client.evals.runs.create = AsyncMock(return_value=mock_run) + + mock_completed = MagicMock() + mock_completed.status = "completed" + mock_completed.result_counts = {"passed": 2, "failed": 0} + mock_completed.report_url = "https://portal.azure.com/eval/run_456" + mock_completed.per_testing_criteria_results = None + mock_client.evals.runs.retrieve = AsyncMock(return_value=mock_completed) + + # Mock output_items.list so _fetch_output_items exercises the full flow + mock_output_item = MagicMock() + mock_output_item.id = "output_item_1" + mock_output_item.status = "pass" + mock_output_item.sample = MagicMock(error=None, usage=None, input=[], output=[]) + mock_result = MagicMock(status="pass", score=5, reason="Relevant response") + 
mock_result.name = "relevance" # MagicMock(name=...) sets display name, not .name attr + mock_output_item.results = [mock_result] + mock_client.evals.runs.output_items.list = AsyncMock(return_value=_AsyncPage([mock_output_item])) + + items = [ + EvalItem(conversation=[Message("user", ["Hello"]), Message("assistant", ["Hi there!"])]), + EvalItem(conversation=[Message("user", ["Weather?"]), Message("assistant", ["Sunny."])]), + ] + + fe = FoundryEvals( + client=mock_client, + model="gpt-4o", + evaluators=[FoundryEvals.RELEVANCE], + ) + results = await fe.evaluate(items) + + assert isinstance(results, EvalResults) + assert results.status == "completed" + assert results.eval_id == "eval_123" + assert results.run_id == "run_456" + assert results.report_url == "https://portal.azure.com/eval/run_456" + assert results.all_passed + assert results.passed == 2 + assert results.failed == 0 + + # Verify per-item output_items were fetched + assert len(results.items) == 1 + assert results.items[0].item_id == "output_item_1" + assert results.items[0].status == "pass" + assert len(results.items[0].scores) == 1 + assert results.items[0].scores[0].name == "relevance" + assert results.items[0].scores[0].score == 5 + + # Verify evals.create was called with correct structure + create_call = mock_client.evals.create.call_args + assert create_call.kwargs["name"] == "Agent Framework Eval" + assert create_call.kwargs["data_source_config"]["type"] == "custom" + + # Verify evals.runs.create was called with JSONL data source + run_call = mock_client.evals.runs.create.call_args + assert run_call.kwargs["data_source"]["type"] == "jsonl" + content = run_call.kwargs["data_source"]["source"]["content"] + assert len(content) == 2 + + async def test_evaluate_uses_default_evaluators(self) -> None: + mock_client = MagicMock() + + mock_eval = MagicMock() + mock_eval.id = "eval_1" + mock_client.evals.create = AsyncMock(return_value=mock_eval) + + mock_run = MagicMock() + mock_run.id = "run_1" + mock_client.evals.runs.create = AsyncMock(return_value=mock_run) + + mock_completed = MagicMock() + mock_completed.status = "completed" + mock_completed.result_counts = {"passed": 1, "failed": 0} + mock_completed.report_url = None + mock_completed.per_testing_criteria_results = None + mock_client.evals.runs.retrieve = AsyncMock(return_value=mock_completed) + + fe = FoundryEvals(client=mock_client, model="gpt-4o") + await fe.evaluate([EvalItem(conversation=[Message("user", ["Hi"]), Message("assistant", ["Hello"])])]) + + # Verify default evaluators were used + create_call = mock_client.evals.create.call_args + criteria = create_call.kwargs["testing_criteria"] + names = {c["name"] for c in criteria} + assert "relevance" in names + assert "coherence" in names + assert "task_adherence" in names + + async def test_evaluate_uses_dataset_path(self) -> None: + """Items use the JSONL dataset path.""" + mock_client = MagicMock() + + mock_eval = MagicMock() + mock_eval.id = "eval_ds" + mock_client.evals.create = AsyncMock(return_value=mock_eval) + + mock_run = MagicMock() + mock_run.id = "run_ds" + mock_client.evals.runs.create = AsyncMock(return_value=mock_run) + + mock_completed = MagicMock() + mock_completed.status = "completed" + mock_completed.result_counts = {"passed": 1, "failed": 0} + mock_completed.report_url = None + mock_completed.per_testing_criteria_results = None + mock_client.evals.runs.retrieve = AsyncMock(return_value=mock_completed) + + items = [ + EvalItem( + conversation=[Message("user", ["What's the weather?"]), 
Message("assistant", ["Sunny"])], + ), + ] + + fe = FoundryEvals(client=mock_client, model="gpt-4o") + await fe.evaluate(items) + + run_call = mock_client.evals.runs.create.call_args + ds = run_call.kwargs["data_source"] + assert ds["type"] == "jsonl" + content = ds["source"]["content"] + assert content[0]["item"]["query"] == "What's the weather?" + + async def test_evaluate_with_tool_items_uses_dataset_path(self) -> None: + """Items with tool_definitions use the dataset path.""" + mock_client = MagicMock() + + mock_eval = MagicMock() + mock_eval.id = "eval_tool" + mock_client.evals.create = AsyncMock(return_value=mock_eval) + + mock_run = MagicMock() + mock_run.id = "run_tool" + mock_client.evals.runs.create = AsyncMock(return_value=mock_run) + + mock_completed = MagicMock() + mock_completed.status = "completed" + mock_completed.result_counts = {"passed": 1, "failed": 0} + mock_completed.report_url = None + mock_completed.per_testing_criteria_results = None + mock_client.evals.runs.retrieve = AsyncMock(return_value=mock_completed) + + items = [ + EvalItem( + conversation=[Message("user", ["Do the thing"]), Message("assistant", ["Done"])], + tools=[_make_tool("my_tool")], + ), + ] + + fe = FoundryEvals( + client=mock_client, + model="gpt-4o", + evaluators=[FoundryEvals.TOOL_CALL_ACCURACY], + ) + await fe.evaluate(items) + + run_call = mock_client.evals.runs.create.call_args + ds = run_call.kwargs["data_source"] + assert ds["type"] == "jsonl" + assert "tool_definitions" in ds["source"]["content"][0]["item"] + + async def test_evaluate_with_project_client(self) -> None: + mock_oai = MagicMock(spec=AsyncOpenAI) + mock_project = MagicMock() + mock_project.get_openai_client.return_value = mock_oai + + mock_eval = MagicMock() + mock_eval.id = "eval_pc" + mock_oai.evals.create = AsyncMock(return_value=mock_eval) + + mock_run = MagicMock() + mock_run.id = "run_pc" + mock_oai.evals.runs.create = AsyncMock(return_value=mock_run) + + mock_completed = MagicMock() + mock_completed.status = "completed" + mock_completed.result_counts = {"passed": 1, "failed": 0} + mock_completed.report_url = None + mock_completed.per_testing_criteria_results = None + mock_oai.evals.runs.retrieve = AsyncMock(return_value=mock_completed) + + fe = FoundryEvals(project_client=mock_project, model="gpt-4o") + results = await fe.evaluate([EvalItem(conversation=[Message("user", ["Hi"]), Message("assistant", ["Hello"])])]) + + assert results.status == "completed" + mock_project.get_openai_client.assert_called_once() + + +# --------------------------------------------------------------------------- +# FoundryEvals constants +# --------------------------------------------------------------------------- + + +class TestEvaluators: + def test_constants_resolve(self) -> None: + assert _resolve_evaluator(FoundryEvals.RELEVANCE) == "builtin.relevance" + assert _resolve_evaluator(FoundryEvals.TOOL_CALL_ACCURACY) == "builtin.tool_call_accuracy" + assert _resolve_evaluator(FoundryEvals.VIOLENCE) == "builtin.violence" + assert _resolve_evaluator(FoundryEvals.INTENT_RESOLUTION) == "builtin.intent_resolution" + + def test_all_constants_are_valid(self) -> None: + for attr in dir(FoundryEvals): + if attr.startswith("_"): + continue + value = getattr(FoundryEvals, attr) + if isinstance(value, str): + _resolve_evaluator(value) # should not raise + + +# --------------------------------------------------------------------------- +# _resolve_default_evaluators +# --------------------------------------------------------------------------- + + +class 
TestResolveDefaultEvaluators: + def test_explicit_evaluators_passthrough(self) -> None: + result = _resolve_default_evaluators([FoundryEvals.VIOLENCE]) + assert result == [FoundryEvals.VIOLENCE] + + def test_none_gives_defaults(self) -> None: + result = _resolve_default_evaluators(None) + assert FoundryEvals.RELEVANCE in result + assert FoundryEvals.COHERENCE in result + assert FoundryEvals.TASK_ADHERENCE in result + assert FoundryEvals.TOOL_CALL_ACCURACY not in result + + def test_none_with_tool_items_adds_tool_eval(self) -> None: + items = [ + EvalItem( + conversation=[Message("user", ["search for stuff"]), Message("assistant", ["found it"])], + tools=[_make_tool("search")], + ), + ] + result = _resolve_default_evaluators(None, items=items) + assert FoundryEvals.TOOL_CALL_ACCURACY in result + + def test_explicit_evaluators_ignore_tool_items(self) -> None: + items = [ + EvalItem( + conversation=[Message("user", ["search"]), Message("assistant", ["found"])], + tools=[_make_tool("search")], + ), + ] + result = _resolve_default_evaluators([FoundryEvals.RELEVANCE], items=items) + assert result == [FoundryEvals.RELEVANCE] + + +# --------------------------------------------------------------------------- +# _filter_tool_evaluators +# --------------------------------------------------------------------------- + + +class TestFilterToolEvaluators: + def test_keeps_tool_evaluators_when_items_have_tools(self) -> None: + items = [ + EvalItem(conversation=[Message("user", ["q"]), Message("assistant", ["r"])], tools=[_make_tool("t")]), + ] + result = _filter_tool_evaluators( + ["relevance", "tool_call_accuracy"], + items, + ) + assert "relevance" in result + assert "tool_call_accuracy" in result + + def test_removes_tool_evaluators_when_no_tools(self) -> None: + items = [ + EvalItem(conversation=[Message("user", ["q"]), Message("assistant", ["r"])]), + ] + result = _filter_tool_evaluators( + ["relevance", "tool_call_accuracy"], + items, + ) + assert "relevance" in result + assert "tool_call_accuracy" not in result + + def test_raises_when_all_filtered(self) -> None: + items = [ + EvalItem(conversation=[Message("user", ["q"]), Message("assistant", ["r"])]), + ] + with pytest.raises(ValueError, match="require tool definitions"): + _filter_tool_evaluators( + ["tool_call_accuracy", "tool_selection"], + items, + ) + + +# --------------------------------------------------------------------------- +# EvalResults +# --------------------------------------------------------------------------- + + +class TestEvalResults: + def test_all_passed_true(self) -> None: + r = EvalResults( + provider="test", + eval_id="e", + run_id="r", + status="completed", + result_counts={"passed": 3, "failed": 0, "errored": 0}, + ) + assert r.all_passed + assert r.passed == 3 + assert r.failed == 0 + assert r.total == 3 + + def test_all_passed_false_on_failure(self) -> None: + r = EvalResults( + provider="test", + eval_id="e", + run_id="r", + status="completed", + result_counts={"passed": 2, "failed": 1, "errored": 0}, + ) + assert not r.all_passed + assert r.failed == 1 + + def test_all_passed_false_on_error(self) -> None: + r = EvalResults( + provider="test", + eval_id="e", + run_id="r", + status="completed", + result_counts={"passed": 2, "failed": 0, "errored": 1}, + ) + assert not r.all_passed + + def test_all_passed_false_on_non_completed(self) -> None: + r = EvalResults( + provider="test", + eval_id="e", + run_id="r", + status="timeout", + result_counts={"passed": 2, "failed": 0, "errored": 0}, + ) + assert not r.all_passed + + 
def test_all_passed_false_on_empty(self) -> None: + r = EvalResults( + provider="test", + eval_id="e", + run_id="r", + status="completed", + result_counts={"passed": 0, "failed": 0, "errored": 0}, + ) + assert not r.all_passed + + def test_raise_for_status_succeeds(self) -> None: + r = EvalResults( + provider="test", + eval_id="e", + run_id="r", + status="completed", + result_counts={"passed": 1, "failed": 0, "errored": 0}, + ) + r.raise_for_status() # should not raise + + def test_raise_for_status_raises(self) -> None: + r = EvalResults( + provider="test", + eval_id="e", + run_id="r", + status="completed", + result_counts={"passed": 1, "failed": 1, "errored": 0}, + ) + with pytest.raises(EvalNotPassedError, match="1 passed, 1 failed"): + r.raise_for_status() + + def test_raise_for_status_custom_message(self) -> None: + r = EvalResults(provider="test", eval_id="e", run_id="r", status="failed") + with pytest.raises(EvalNotPassedError, match="custom error"): + r.raise_for_status("custom error") + + def test_none_result_counts(self) -> None: + r = EvalResults(provider="test", eval_id="e", run_id="r", status="completed") + assert r.passed == 0 + assert r.failed == 0 + assert r.total == 0 + assert not r.all_passed + + +# --------------------------------------------------------------------------- +# _resolve_openai_client +# --------------------------------------------------------------------------- + + +class TestResolveOpenAIClient: + def test_explicit_client(self) -> None: + mock_client = MagicMock() + assert _resolve_openai_client(client=mock_client) is mock_client + + def test_project_client(self) -> None: + mock_oai = MagicMock(spec=AsyncOpenAI) + mock_project = MagicMock() + mock_project.get_openai_client.return_value = mock_oai + + result = _resolve_openai_client(project_client=mock_project) + assert result is mock_oai + mock_project.get_openai_client.assert_called_once() + + def test_explicit_takes_precedence(self) -> None: + mock_client = MagicMock() + mock_project = MagicMock() + + result = _resolve_openai_client(client=mock_client, project_client=mock_project) + assert result is mock_client + mock_project.get_openai_client.assert_not_called() + + def test_neither_raises(self) -> None: + with pytest.raises(ValueError, match="Provide either"): + _resolve_openai_client() + + +# --------------------------------------------------------------------------- +# evaluate_agent with responses= (core function, uses FoundryEvals as evaluator) +# --------------------------------------------------------------------------- + + +class TestEvaluateAgentWithResponses: + async def test_responses_without_queries_raises(self) -> None: + mock_oai = MagicMock() + response = AgentResponse(messages=[Message("assistant", ["Hello"])]) + + with pytest.raises(ValueError, match="Provide 'queries' alongside 'responses'"): + await evaluate_agent( + responses=response, + evaluators=FoundryEvals(client=mock_oai, model="gpt-4o"), + ) + + async def test_fallback_to_dataset_with_query(self) -> None: + """Non-Responses-API: falls back to dataset path when query is provided.""" + mock_oai = MagicMock() + + mock_eval = MagicMock() + mock_eval.id = "eval_fb" + mock_oai.evals.create = AsyncMock(return_value=mock_eval) + + mock_run = MagicMock() + mock_run.id = "run_fb" + mock_oai.evals.runs.create = AsyncMock(return_value=mock_run) + + mock_completed = MagicMock() + mock_completed.status = "completed" + mock_completed.result_counts = {"passed": 1, "failed": 0} + mock_completed.report_url = "https://portal.azure.com/eval" + 
mock_completed.per_testing_criteria_results = None + mock_oai.evals.runs.retrieve = AsyncMock(return_value=mock_completed) + + response = AgentResponse(messages=[Message("assistant", ["It's sunny."])]) + + results = await evaluate_agent( + responses=response, + queries=["What's the weather?"], + evaluators=FoundryEvals(client=mock_oai, model="gpt-4o"), + ) + + assert results[0].status == "completed" + assert results[0].all_passed + + # Should use jsonl data source (dataset path), not azure_ai_responses + run_call = mock_oai.evals.runs.create.call_args + ds = run_call.kwargs["data_source"] + assert ds["type"] == "jsonl" + content = ds["source"]["content"] + assert len(content) == 1 + assert content[0]["item"]["query"] == "What's the weather?" + assert content[0]["item"]["response"] == "It's sunny." + + async def test_fallback_with_agent_extracts_tools(self) -> None: + """Non-Responses-API with agent: tool definitions are included in the eval item.""" + mock_oai = MagicMock() + + mock_eval = MagicMock() + mock_eval.id = "eval_tools" + mock_oai.evals.create = AsyncMock(return_value=mock_eval) + + mock_run = MagicMock() + mock_run.id = "run_tools" + mock_oai.evals.runs.create = AsyncMock(return_value=mock_run) + + mock_completed = MagicMock() + mock_completed.status = "completed" + mock_completed.result_counts = {"passed": 1, "failed": 0} + mock_completed.report_url = None + mock_completed.per_testing_criteria_results = None + mock_oai.evals.runs.retrieve = AsyncMock(return_value=mock_completed) + + mock_agent = MagicMock() + mock_agent.default_options = { + "tools": [FunctionTool(name="my_tool", description="A test tool", func=lambda x: x)] + } + + response = AgentResponse(messages=[Message("assistant", ["Result."])]) + + results = await evaluate_agent( + responses=response, + queries=["Do the thing"], + agent=mock_agent, + evaluators=FoundryEvals(client=mock_oai, model="gpt-4o"), + ) + + assert results[0].status == "completed" + + run_call = mock_oai.evals.runs.create.call_args + ds = run_call.kwargs["data_source"] + content = ds["source"]["content"] + item = content[0]["item"] + assert "tool_definitions" in item + tool_defs = item["tool_definitions"] + assert any(t["name"] == "my_tool" for t in tool_defs) + + async def test_fallback_multiple_responses_with_queries(self) -> None: + """Non-Responses-API with multiple responses requires matching queries.""" + mock_oai = MagicMock() + + mock_eval = MagicMock() + mock_eval.id = "eval_multi_fb" + mock_oai.evals.create = AsyncMock(return_value=mock_eval) + + mock_run = MagicMock() + mock_run.id = "run_multi_fb" + mock_oai.evals.runs.create = AsyncMock(return_value=mock_run) + + mock_completed = MagicMock() + mock_completed.status = "completed" + mock_completed.result_counts = {"passed": 2, "failed": 0} + mock_completed.report_url = None + mock_completed.per_testing_criteria_results = None + mock_oai.evals.runs.retrieve = AsyncMock(return_value=mock_completed) + + responses = [ + AgentResponse(messages=[Message("assistant", ["Answer 1"])]), + AgentResponse(messages=[Message("assistant", ["Answer 2"])]), + ] + + results = await evaluate_agent( + responses=responses, + queries=["Question 1", "Question 2"], + evaluators=FoundryEvals(client=mock_oai, model="gpt-4o"), + ) + + assert results[0].passed == 2 + run_call = mock_oai.evals.runs.create.call_args + content = run_call.kwargs["data_source"]["source"]["content"] + assert len(content) == 2 + assert content[0]["item"]["query"] == "Question 1" + assert content[1]["item"]["query"] == "Question 2" + + 
async def test_query_response_count_mismatch_raises(self) -> None: + """Mismatched query and response counts should raise.""" + mock_oai = MagicMock() + + responses = [ + AgentResponse(messages=[Message("assistant", ["A1"])]), + AgentResponse(messages=[Message("assistant", ["A2"])]), + ] + + with pytest.raises(ValueError, match="queries but"): + await evaluate_agent( + responses=responses, + queries=["Q1", "Q2", "Q3"], + evaluators=FoundryEvals(client=mock_oai, model="gpt-4o"), + ) + + async def test_tool_evaluators_with_query_and_agent_uses_dataset_path(self) -> None: + """Tool evaluators with query+agent uses dataset path.""" + mock_oai = MagicMock() + + mock_eval = MagicMock() + mock_eval.id = "eval_tool" + mock_oai.evals.create = AsyncMock(return_value=mock_eval) + + mock_run = MagicMock() + mock_run.id = "run_tool" + mock_oai.evals.runs.create = AsyncMock(return_value=mock_run) + + mock_completed = MagicMock() + mock_completed.status = "completed" + mock_completed.result_counts = {"passed": 1, "failed": 0} + mock_completed.report_url = None + mock_completed.per_testing_criteria_results = None + mock_oai.evals.runs.retrieve = AsyncMock(return_value=mock_completed) + + response = AgentResponse( + messages=[Message("assistant", ["It's sunny"])], + ) + + agent = MagicMock() + agent.default_options = { + "tools": [ + FunctionTool(name="get_weather", description="Get weather", func=lambda: None), + ] + } + + fe = FoundryEvals( + client=mock_oai, + model="gpt-4o", + evaluators=[FoundryEvals.TOOL_CALL_ACCURACY], + ) + + await evaluate_agent( + responses=response, + queries=["What's the weather?"], + agent=agent, + evaluators=fe, + ) + + # Verify it used the dataset path (jsonl), not Responses API path + run_call = mock_oai.evals.runs.create.call_args + ds = run_call.kwargs["data_source"] + assert ds["type"] == "jsonl" + + # Verify tool_definitions are in the data items + items = ds["source"]["content"] + assert "tool_definitions" in items[0]["item"] + + +# --------------------------------------------------------------------------- +# EvalResults.sub_results +# --------------------------------------------------------------------------- + + +class TestEvalResultsSubResults: + def test_sub_results_default_empty(self) -> None: + r = EvalResults( + provider="test", + eval_id="e1", + run_id="r1", + status="completed", + result_counts={"passed": 1, "failed": 0}, + ) + assert r.sub_results == {} + assert r.all_passed + + def test_all_passed_checks_sub_results(self) -> None: + parent = EvalResults( + provider="test", + eval_id="e1", + run_id="r1", + status="completed", + result_counts={"passed": 2, "failed": 0}, + sub_results={ + "agent-a": EvalResults( + provider="test", + eval_id="e2", + run_id="r2", + status="completed", + result_counts={"passed": 1, "failed": 0}, + ), + "agent-b": EvalResults( + provider="test", + eval_id="e3", + run_id="r3", + status="completed", + result_counts={"passed": 1, "failed": 1}, + ), + }, + ) + assert not parent.all_passed # agent-b has a failure + + def test_all_passed_with_all_sub_passing(self) -> None: + parent = EvalResults( + provider="test", + eval_id="e1", + run_id="r1", + status="completed", + result_counts={"passed": 2, "failed": 0}, + sub_results={ + "agent-a": EvalResults( + provider="test", + eval_id="e2", + run_id="r2", + status="completed", + result_counts={"passed": 1, "failed": 0}, + ), + }, + ) + assert parent.all_passed + + def test_raise_for_status_includes_failed_agents(self) -> None: + parent = EvalResults( + provider="test", + eval_id="e1", + 
run_id="r1", + status="completed", + result_counts={"passed": 2, "failed": 0}, + sub_results={ + "good-agent": EvalResults( + provider="test", + eval_id="e2", + run_id="r2", + status="completed", + result_counts={"passed": 1, "failed": 0}, + ), + "bad-agent": EvalResults( + provider="test", + eval_id="e3", + run_id="r3", + status="completed", + result_counts={"passed": 0, "failed": 1}, + ), + }, + ) + with pytest.raises(EvalNotPassedError, match="bad-agent"): + parent.raise_for_status() + + +# --------------------------------------------------------------------------- +# _extract_agent_eval_data +# --------------------------------------------------------------------------- + + +def _make_agent_exec_response( + executor_id: str, + response_text: str, + user_messages: list[str] | None = None, +) -> AgentExecutorResponse: + """Helper to build an AgentExecutorResponse for testing.""" + agent_response = AgentResponse(messages=[Message("assistant", [response_text])]) + full_conv: list[Message] = [] + if user_messages: + for m in user_messages: + full_conv.append(Message("user", [m])) + full_conv.extend(agent_response.messages) + return AgentExecutorResponse( + executor_id=executor_id, + agent_response=agent_response, + full_conversation=full_conv, + ) + + +class TestExtractAgentEvalData: + def test_extracts_single_agent(self) -> None: + aer = _make_agent_exec_response("planner", "Plan is ready", ["Plan a trip"]) + + events = [ + WorkflowEvent.executor_invoked("planner", "Plan a trip"), + WorkflowEvent.executor_completed("planner", [aer]), + ] + result = WorkflowRunResult(events, []) + + data = _extract_agent_eval_data(result) + assert len(data) == 1 + assert data[0]["executor_id"] == "planner" + assert data[0]["response"].text == "Plan is ready" + + def test_extracts_multiple_agents(self) -> None: + aer1 = _make_agent_exec_response("planner", "Plan done", ["Plan a trip"]) + aer2 = _make_agent_exec_response("booker", "Booked!", ["Book flight"]) + + events = [ + WorkflowEvent.executor_invoked("planner", "Plan a trip"), + WorkflowEvent.executor_completed("planner", [aer1]), + WorkflowEvent.executor_invoked("booker", "Book flight"), + WorkflowEvent.executor_completed("booker", [aer2]), + ] + result = WorkflowRunResult(events, []) + + data = _extract_agent_eval_data(result) + assert len(data) == 2 + assert data[0]["executor_id"] == "planner" + assert data[1]["executor_id"] == "booker" + + def test_skips_internal_executors(self) -> None: + aer = _make_agent_exec_response("planner", "Done", ["Go"]) + + events = [ + WorkflowEvent.executor_invoked("input-conversation", "hello"), + WorkflowEvent.executor_completed("input-conversation", ["hello"]), + WorkflowEvent.executor_invoked("planner", "Go"), + WorkflowEvent.executor_completed("planner", [aer]), + WorkflowEvent.executor_invoked("end", []), + WorkflowEvent.executor_completed("end", None), + ] + result = WorkflowRunResult(events, []) + + data = _extract_agent_eval_data(result) + assert len(data) == 1 + assert data[0]["executor_id"] == "planner" + + def test_resolves_agent_from_workflow(self) -> None: + aer = _make_agent_exec_response("my-agent", "Done", ["Do it"]) + + events = [ + WorkflowEvent.executor_invoked("my-agent", "Do it"), + WorkflowEvent.executor_completed("my-agent", [aer]), + ] + result = WorkflowRunResult(events, []) + + # Build a mock workflow with AgentExecutor + from agent_framework import AgentExecutor + + mock_agent = MagicMock() + mock_agent.default_options = {"tools": []} + mock_executor = MagicMock(spec=AgentExecutor) + 
mock_executor.agent = mock_agent + + mock_workflow = MagicMock() + mock_workflow.executors = {"my-agent": mock_executor} + + data = _extract_agent_eval_data(result, mock_workflow) + assert len(data) == 1 + assert data[0]["agent"] is mock_agent + + +class TestExtractOverallQuery: + def test_extracts_string_query(self) -> None: + events = [WorkflowEvent.executor_invoked("input", "Plan a trip")] + result = WorkflowRunResult(events, []) + assert _extract_overall_query(result) == "Plan a trip" + + def test_extracts_message_query(self) -> None: + msgs = [Message("user", ["What's the weather?"])] + events = [WorkflowEvent.executor_invoked("input", msgs)] + result = WorkflowRunResult(events, []) + assert "What's the weather?" in (_extract_overall_query(result) or "") + + def test_returns_none_for_empty(self) -> None: + result = WorkflowRunResult([], []) + assert _extract_overall_query(result) is None + + +# --------------------------------------------------------------------------- +# evaluate_workflow (core function, uses FoundryEvals as evaluator) +# --------------------------------------------------------------------------- + + +class TestEvaluateWorkflow: + def _mock_oai_client(self, eval_id: str = "eval_wf", run_id: str = "run_wf") -> MagicMock: + mock_oai = MagicMock() + mock_eval = MagicMock() + mock_eval.id = eval_id + mock_oai.evals.create = AsyncMock(return_value=mock_eval) + mock_run = MagicMock() + mock_run.id = run_id + mock_oai.evals.runs.create = AsyncMock(return_value=mock_run) + mock_completed = MagicMock() + mock_completed.status = "completed" + mock_completed.result_counts = {"passed": 1, "failed": 0} + mock_completed.report_url = "https://portal.azure.com/eval" + mock_completed.per_testing_criteria_results = None + mock_oai.evals.runs.retrieve = AsyncMock(return_value=mock_completed) + return mock_oai + + async def test_post_hoc_with_workflow_result(self) -> None: + """Evaluate a workflow result that was already produced.""" + mock_oai = self._mock_oai_client() + + aer1 = _make_agent_exec_response("writer", "Draft written", ["Write about Paris"]) + aer2 = _make_agent_exec_response("reviewer", "Looks good!", ["Review: Draft written"]) + + final_output = [Message("assistant", ["Final reviewed output"])] + + events = [ + WorkflowEvent.executor_invoked("input-conversation", "Write about Paris"), + WorkflowEvent.executor_completed("input-conversation", None), + WorkflowEvent.executor_invoked("writer", "Write about Paris"), + WorkflowEvent.executor_completed("writer", [aer1]), + WorkflowEvent.executor_invoked("reviewer", [aer1]), + WorkflowEvent.executor_completed("reviewer", [aer2]), + WorkflowEvent.output("end", final_output), + ] + wf_result = WorkflowRunResult(events, []) + + mock_workflow = MagicMock() + mock_workflow.executors = {} + + results = await evaluate_workflow( + workflow=mock_workflow, + workflow_result=wf_result, + evaluators=FoundryEvals(client=mock_oai, model="gpt-4o"), + include_overall=False, + ) + + assert results[0].status == "completed" + assert "writer" in results[0].sub_results + assert "reviewer" in results[0].sub_results + assert len(results[0].sub_results) == 2 + + async def test_with_queries_runs_workflow(self) -> None: + """Passing queries= runs the workflow and evaluates.""" + mock_oai = self._mock_oai_client() + + aer = _make_agent_exec_response("agent", "Response", ["Query"]) + final_output = [Message("assistant", ["Final"])] + + events = [ + WorkflowEvent.executor_invoked("agent", "Test query"), + WorkflowEvent.executor_completed("agent", [aer]), + 
WorkflowEvent.output("end", final_output), + ] + wf_result = WorkflowRunResult(events, []) + + mock_workflow = MagicMock() + mock_workflow.executors = {} + mock_workflow.run = AsyncMock(return_value=wf_result) + + results = await evaluate_workflow( + workflow=mock_workflow, + queries=["Test query"], + evaluators=FoundryEvals(client=mock_oai, model="gpt-4o"), + include_overall=False, + ) + + mock_workflow.run.assert_called_once_with("Test query") + assert "agent" in results[0].sub_results + + async def test_overall_plus_per_agent(self) -> None: + """Both overall and per-agent evals run by default.""" + mock_oai = self._mock_oai_client() + + aer = _make_agent_exec_response("planner", "Plan done", ["Plan trip"]) + final_output = [Message("assistant", ["Trip planned!"])] + + events = [ + WorkflowEvent.executor_invoked("input-conversation", "Plan trip"), + WorkflowEvent.executor_completed("input-conversation", None), + WorkflowEvent.executor_invoked("planner", "Plan trip"), + WorkflowEvent.executor_completed("planner", [aer]), + WorkflowEvent.output("end", final_output), + ] + wf_result = WorkflowRunResult(events, []) + + mock_workflow = MagicMock() + mock_workflow.executors = {} + + results = await evaluate_workflow( + workflow=mock_workflow, + workflow_result=wf_result, + evaluators=FoundryEvals(client=mock_oai, model="gpt-4o"), + ) + + # Should have per-agent sub_results AND overall + assert "planner" in results[0].sub_results + assert results[0].status == "completed" + # FoundryEvals.evaluate called twice: once for planner, once for overall + assert mock_oai.evals.create.call_count == 2 + + async def test_no_result_or_queries_raises(self) -> None: + mock_oai = MagicMock() + mock_workflow = MagicMock() + + with pytest.raises(ValueError, match="Provide either"): + await evaluate_workflow( + workflow=mock_workflow, + evaluators=FoundryEvals(client=mock_oai, model="gpt-4o"), + ) + + async def test_per_agent_only(self) -> None: + """include_overall=False skips the overall eval.""" + mock_oai = self._mock_oai_client() + + aer = _make_agent_exec_response("agent-a", "Done", ["Do stuff"]) + + events = [ + WorkflowEvent.executor_invoked("agent-a", "Do stuff"), + WorkflowEvent.executor_completed("agent-a", [aer]), + ] + wf_result = WorkflowRunResult(events, []) + + mock_workflow = MagicMock() + mock_workflow.executors = {} + + results = await evaluate_workflow( + workflow=mock_workflow, + workflow_result=wf_result, + evaluators=FoundryEvals(client=mock_oai, model="gpt-4o"), + include_overall=False, + ) + + assert "agent-a" in results[0].sub_results + # Only one eval call (per-agent), no overall + assert mock_oai.evals.create.call_count == 1 + + async def test_overall_eval_excludes_tool_evaluators(self) -> None: + """Tool evaluators should not be passed to the overall workflow eval.""" + mock_oai = self._mock_oai_client() + + aer = _make_agent_exec_response("researcher", "Weather is sunny", ["What's the weather?"]) + + events = [ + WorkflowEvent.executor_invoked("input-conversation", "What's the weather?"), + WorkflowEvent.executor_completed("input-conversation", None), + WorkflowEvent.executor_invoked("researcher", "What's the weather?"), + WorkflowEvent.executor_completed("researcher", [aer]), + WorkflowEvent.output("end", [Message("assistant", ["Weather is sunny"])]), + ] + wf_result = WorkflowRunResult(events, []) + + mock_workflow = MagicMock() + mock_workflow.executors = {} + + fe = FoundryEvals( + client=mock_oai, + model="gpt-4o", + evaluators=[FoundryEvals.RELEVANCE, 
FoundryEvals.TOOL_CALL_ACCURACY], + ) + + await evaluate_workflow( + workflow=mock_workflow, + workflow_result=wf_result, + evaluators=fe, + ) + + # Should have 2 evals: one per-agent, one overall + assert mock_oai.evals.create.call_count == 2 + + # Check the overall eval's testing_criteria doesn't include tool_call_accuracy + overall_call = mock_oai.evals.create.call_args_list[-1] + overall_criteria = overall_call.kwargs["testing_criteria"] + evaluator_names = [c["evaluator_name"] for c in overall_criteria] + assert "builtin.tool_call_accuracy" not in evaluator_names + assert "builtin.relevance" in evaluator_names + + async def test_per_agent_excludes_tool_evaluators_when_no_tools(self) -> None: + """Sub-agents without tools should not get tool evaluators.""" + mock_oai = self._mock_oai_client() + + # researcher has tools, planner does not + aer1 = _make_agent_exec_response("researcher", "Weather is sunny", ["Check weather"]) + aer2 = _make_agent_exec_response("planner", "Trip planned", ["Plan based on: sunny"]) + + events = [ + WorkflowEvent.executor_invoked("researcher", "Check weather"), + WorkflowEvent.executor_completed("researcher", [aer1]), + WorkflowEvent.executor_invoked("planner", "Plan based on: sunny"), + WorkflowEvent.executor_completed("planner", [aer2]), + ] + wf_result = WorkflowRunResult(events, []) + + from agent_framework import AgentExecutor + + # researcher has tools + mock_researcher = MagicMock() + mock_researcher.default_options = { + "tools": [ + FunctionTool(name="get_weather", description="Get weather", func=lambda: None), + ] + } + mock_researcher_executor = MagicMock(spec=AgentExecutor) + mock_researcher_executor.agent = mock_researcher + + # planner has NO tools + mock_planner = MagicMock() + mock_planner.default_options = {"tools": []} + mock_planner_executor = MagicMock(spec=AgentExecutor) + mock_planner_executor.agent = mock_planner + + mock_workflow = MagicMock() + mock_workflow.executors = { + "researcher": mock_researcher_executor, + "planner": mock_planner_executor, + } + + fe = FoundryEvals( + client=mock_oai, + model="gpt-4o", + evaluators=[FoundryEvals.RELEVANCE, FoundryEvals.TOOL_CALL_ACCURACY], + ) + + await evaluate_workflow( + workflow=mock_workflow, + workflow_result=wf_result, + evaluators=fe, + include_overall=False, + ) + + # Two sub-agent evals + assert mock_oai.evals.create.call_count == 2 + + # Find which call is for researcher vs planner by eval name + for call in mock_oai.evals.create.call_args_list: + criteria = call.kwargs["testing_criteria"] + eval_names = [c["evaluator_name"] for c in criteria] + name = call.kwargs["name"] + if "planner" in name: + assert "builtin.tool_call_accuracy" not in eval_names, ( + "planner has no tools — should not get tool_call_accuracy" + ) + elif "researcher" in name: + assert "builtin.tool_call_accuracy" in eval_names, ( + "researcher has tools — should get tool_call_accuracy" + ) + + +# --------------------------------------------------------------------------- +# EvalItemResult and EvalScoreResult +# --------------------------------------------------------------------------- + + +class TestEvalItemResult: + def test_status_properties(self) -> None: + from agent_framework._evaluation import EvalItemResult + + passed = EvalItemResult(item_id="1", status="pass") + assert passed.is_passed + assert not passed.is_failed + assert not passed.is_error + + failed = EvalItemResult(item_id="2", status="fail") + assert not failed.is_passed + assert failed.is_failed + assert not failed.is_error + + errored = 
EvalItemResult(item_id="3", status="error") + assert not errored.is_passed + assert not errored.is_failed + assert errored.is_error + + errored2 = EvalItemResult(item_id="4", status="errored") + assert errored2.is_error + + def test_with_scores(self) -> None: + from agent_framework._evaluation import EvalItemResult, EvalScoreResult + + scores = [ + EvalScoreResult(name="relevance", score=0.9, passed=True), + EvalScoreResult(name="coherence", score=0.3, passed=False), + ] + item = EvalItemResult(item_id="1", status="fail", scores=scores) + assert len(item.scores) == 2 + assert item.scores[0].passed is True + assert item.scores[1].passed is False + + def test_with_error(self) -> None: + from agent_framework._evaluation import EvalItemResult + + item = EvalItemResult( + item_id="1", + status="error", + error_code="QueryExtractionError", + error_message="Query list cannot be empty", + ) + assert item.is_error + assert item.error_code == "QueryExtractionError" + + def test_with_token_usage(self) -> None: + from agent_framework._evaluation import EvalItemResult + + item = EvalItemResult( + item_id="1", + status="pass", + token_usage={"prompt_tokens": 100, "completion_tokens": 50, "total_tokens": 150}, + ) + assert item.token_usage is not None + assert item.token_usage["total_tokens"] == 150 + + +class TestEvalResultsWithItems: + def test_item_status_properties(self) -> None: + from agent_framework._evaluation import EvalItemResult + + results = EvalResults( + provider="test", + eval_id="e1", + run_id="r1", + status="completed", + result_counts={"passed": 2, "failed": 1, "errored": 1}, + items=[ + EvalItemResult(item_id="1", status="pass"), + EvalItemResult(item_id="2", status="pass"), + EvalItemResult(item_id="3", status="fail"), + EvalItemResult(item_id="4", status="error", error_code="QueryExtractionError"), + ], + ) + assert sum(1 for i in results.items if i.is_passed) == 2 + assert sum(1 for i in results.items if i.is_failed) == 1 + assert sum(1 for i in results.items if i.is_error) == 1 + + def test_raise_for_status_includes_errored_items(self) -> None: + from agent_framework._evaluation import EvalItemResult + + results = EvalResults( + provider="test", + eval_id="e1", + run_id="r1", + status="completed", + result_counts={"passed": 0, "failed": 0, "errored": 2}, + items=[ + EvalItemResult(item_id="i1", status="error", error_code="QueryExtractionError"), + EvalItemResult(item_id="i2", status="error", error_code="TimeoutError"), + ], + ) + with pytest.raises(EvalNotPassedError, match="Errored items: i1: QueryExtractionError"): + results.raise_for_status() + + +# --------------------------------------------------------------------------- +# _fetch_output_items +# --------------------------------------------------------------------------- + + +class TestFetchOutputItems: + async def test_fetches_and_converts_output_items(self) -> None: + from agent_framework_foundry._foundry_evals import _fetch_output_items + + # Build mock output items matching the OpenAI SDK schema + mock_result = MagicMock() + mock_result.name = "relevance" + mock_result.score = 0.85 + mock_result.passed = True + mock_result.sample = None + + mock_usage = MagicMock() + mock_usage.prompt_tokens = 100 + mock_usage.completion_tokens = 50 + mock_usage.total_tokens = 150 + mock_usage.cached_tokens = 0 + + mock_input = MagicMock() + mock_input.role = "user" + mock_input.content = "What is the weather?" + + mock_output = MagicMock() + mock_output.role = "assistant" + mock_output.content = "It is sunny." 
+ + mock_error = MagicMock() + mock_error.code = "" + mock_error.message = "" + + mock_sample = MagicMock() + mock_sample.error = mock_error + mock_sample.usage = mock_usage + mock_sample.input = [mock_input] + mock_sample.output = [mock_output] + + mock_oi = MagicMock() + mock_oi.id = "oi_abc123" + mock_oi.status = "pass" + mock_oi.results = [mock_result] + mock_oi.sample = mock_sample + mock_oi.datasource_item = {"resp_id": "resp_xyz"} + + mock_client = MagicMock() + mock_client.evals.runs.output_items.list = AsyncMock(return_value=_AsyncPage([mock_oi])) + + items = await _fetch_output_items(mock_client, "eval_1", "run_1") + + assert len(items) == 1 + item = items[0] + assert item.item_id == "oi_abc123" + assert item.status == "pass" + assert item.is_passed + assert len(item.scores) == 1 + assert item.scores[0].name == "relevance" + assert item.scores[0].score == 0.85 + assert item.scores[0].passed is True + assert item.response_id == "resp_xyz" + assert item.input_text == "What is the weather?" + assert item.output_text == "It is sunny." + assert item.token_usage is not None + assert item.token_usage["total_tokens"] == 150 + assert item.error_code is None + + async def test_handles_errored_item(self) -> None: + from agent_framework_foundry._foundry_evals import _fetch_output_items + + mock_error = MagicMock() + mock_error.code = "QueryExtractionError" + mock_error.message = "Query list cannot be empty" + + mock_sample = MagicMock() + mock_sample.error = mock_error + mock_sample.usage = None + mock_sample.input = [] + mock_sample.output = [] + + mock_oi = MagicMock() + mock_oi.id = "oi_err1" + mock_oi.status = "error" + mock_oi.results = [] + mock_oi.sample = mock_sample + mock_oi.datasource_item = {} + + mock_client = MagicMock() + mock_client.evals.runs.output_items.list = AsyncMock(return_value=_AsyncPage([mock_oi])) + + items = await _fetch_output_items(mock_client, "eval_1", "run_1") + + assert len(items) == 1 + item = items[0] + assert item.is_error + assert item.error_code == "QueryExtractionError" + assert item.error_message == "Query list cannot be empty" + assert len(item.scores) == 0 + + async def test_handles_api_failure_gracefully(self) -> None: + from agent_framework_foundry._foundry_evals import _fetch_output_items + + mock_client = MagicMock() + mock_client.evals.runs.output_items.list = AsyncMock(side_effect=TypeError("API error")) + + items = await _fetch_output_items(mock_client, "eval_1", "run_1") + assert items == [] + + +# --------------------------------------------------------------------------- +# _poll_eval_run — timeout / failed / canceled paths +# --------------------------------------------------------------------------- + + +class TestPollEvalRun: + async def test_timeout_returns_timeout_status(self) -> None: + """Poll timeout returns EvalResults with status='timeout'.""" + from agent_framework_foundry._foundry_evals import _poll_eval_run + + mock_client = MagicMock() + mock_pending = MagicMock() + mock_pending.status = "queued" + mock_client.evals.runs.retrieve = AsyncMock(return_value=mock_pending) + + results = await _poll_eval_run(mock_client, "eval_1", "run_1", poll_interval=0.01, timeout=0.05) + assert results.status == "timeout" + assert results.eval_id == "eval_1" + assert results.run_id == "run_1" + + async def test_failed_run_returns_error(self) -> None: + """Failed run returns EvalResults with error message.""" + from agent_framework_foundry._foundry_evals import _poll_eval_run + + mock_client = MagicMock() + mock_failed = MagicMock() + 
mock_failed.status = "failed" + mock_failed.error = "Model deployment unavailable" + mock_failed.result_counts = None + mock_failed.report_url = None + mock_failed.per_testing_criteria_results = None + mock_client.evals.runs.retrieve = AsyncMock(return_value=mock_failed) + + results = await _poll_eval_run(mock_client, "eval_1", "run_1", poll_interval=0.01, timeout=5.0) + assert results.status == "failed" + assert results.error == "Model deployment unavailable" + assert results.items == [] + + async def test_canceled_run_returns_canceled_status(self) -> None: + """Canceled run returns EvalResults with status='canceled'.""" + from agent_framework_foundry._foundry_evals import _poll_eval_run + + mock_client = MagicMock() + mock_canceled = MagicMock() + mock_canceled.status = "canceled" + mock_canceled.error = None + mock_canceled.result_counts = None + mock_canceled.report_url = None + mock_canceled.per_testing_criteria_results = None + mock_client.evals.runs.retrieve = AsyncMock(return_value=mock_canceled) + + results = await _poll_eval_run(mock_client, "eval_1", "run_1", poll_interval=0.01, timeout=5.0) + assert results.status == "canceled" + assert results.error is None + assert results.items == [] + + +# --------------------------------------------------------------------------- +# evaluate_traces +# --------------------------------------------------------------------------- + + +class TestEvaluateTraces: + async def test_raises_without_required_args(self) -> None: + """Raises ValueError when no response_ids, trace_ids, or agent_id given.""" + from agent_framework_foundry._foundry_evals import evaluate_traces + + mock_client = MagicMock() + with pytest.raises(ValueError, match="Provide at least one of"): + await evaluate_traces( + client=mock_client, + model="gpt-4o", + ) + + async def test_response_ids_path(self) -> None: + """evaluate_traces with response_ids uses the responses API path.""" + from agent_framework_foundry._foundry_evals import evaluate_traces + + mock_client = MagicMock() + + mock_eval = MagicMock() + mock_eval.id = "eval_tr" + mock_client.evals.create = AsyncMock(return_value=mock_eval) + + mock_run = MagicMock() + mock_run.id = "run_tr" + mock_client.evals.runs.create = AsyncMock(return_value=mock_run) + + mock_completed = MagicMock() + mock_completed.status = "completed" + mock_completed.result_counts = {"passed": 1, "failed": 0} + mock_completed.report_url = "https://portal.azure.com/eval/run_tr" + mock_completed.per_testing_criteria_results = None + mock_client.evals.runs.retrieve = AsyncMock(return_value=mock_completed) + + mock_output_item = MagicMock() + mock_output_item.id = "oi_resp" + mock_output_item.status = "pass" + mock_output_item.sample = MagicMock(error=None, usage=None, input=[], output=[]) + mock_result = MagicMock(status="pass", score=4) + mock_result.name = "relevance" + mock_output_item.results = [mock_result] + mock_client.evals.runs.output_items.list = AsyncMock(return_value=_AsyncPage([mock_output_item])) + + results = await evaluate_traces( + response_ids=["resp_abc", "resp_def"], + client=mock_client, + model="gpt-4o", + ) + assert results.status == "completed" + assert results.eval_id == "eval_tr" + assert len(results.items) == 1 + assert results.items[0].item_id == "oi_resp" + + # Verify the response IDs are in the data source + run_call = mock_client.evals.runs.create.call_args + ds = run_call.kwargs["data_source"] + assert ds["type"] == "azure_ai_responses" + content = ds["item_generation_params"]["source"]["content"] + assert len(content) 
== 2 + assert content[0]["item"]["resp_id"] == "resp_abc" + + async def test_trace_ids_path(self) -> None: + """evaluate_traces with trace_ids builds azure_ai_traces data source.""" + from agent_framework_foundry._foundry_evals import evaluate_traces + + mock_client = MagicMock() + + mock_eval = MagicMock() + mock_eval.id = "eval_tid" + mock_client.evals.create = AsyncMock(return_value=mock_eval) + + mock_run = MagicMock() + mock_run.id = "run_tid" + mock_client.evals.runs.create = AsyncMock(return_value=mock_run) + + mock_completed = MagicMock() + mock_completed.status = "completed" + mock_completed.result_counts = {"passed": 1, "failed": 0} + mock_completed.report_url = None + mock_completed.per_testing_criteria_results = None + mock_client.evals.runs.retrieve = AsyncMock(return_value=mock_completed) + + results = await evaluate_traces( + trace_ids=["trace_1"], + client=mock_client, + model="gpt-4o", + ) + assert results.status == "completed" + + run_call = mock_client.evals.runs.create.call_args + ds = run_call.kwargs["data_source"] + assert ds["type"] == "azure_ai_traces" + assert ds["trace_ids"] == ["trace_1"] + + +# --------------------------------------------------------------------------- +# evaluate_foundry_target +# --------------------------------------------------------------------------- + + +class TestEvaluateFoundryTarget: + async def test_happy_path(self) -> None: + """evaluate_foundry_target creates eval + run and polls to completion.""" + from agent_framework_foundry._foundry_evals import evaluate_foundry_target + + mock_client = MagicMock() + + mock_eval = MagicMock() + mock_eval.id = "eval_tgt" + mock_client.evals.create = AsyncMock(return_value=mock_eval) + + mock_run = MagicMock() + mock_run.id = "run_tgt" + mock_client.evals.runs.create = AsyncMock(return_value=mock_run) + + mock_completed = MagicMock() + mock_completed.status = "completed" + mock_completed.result_counts = {"passed": 2, "failed": 0} + mock_completed.report_url = "https://portal.azure.com/eval/run_tgt" + mock_completed.per_testing_criteria_results = None + mock_client.evals.runs.retrieve = AsyncMock(return_value=mock_completed) + + results = await evaluate_foundry_target( + target={"type": "azure_ai_agent", "name": "my-agent"}, + test_queries=["Query 1", "Query 2"], + client=mock_client, + model="gpt-4o", + ) + assert results.status == "completed" + assert results.eval_id == "eval_tgt" + assert results.all_passed + + # Verify the target and queries in data source + run_call = mock_client.evals.runs.create.call_args + ds = run_call.kwargs["data_source"] + assert ds["type"] == "azure_ai_target_completions" + assert ds["target"]["type"] == "azure_ai_agent" + content = ds["source"]["content"] + assert len(content) == 2 + assert content[0]["item"]["query"] == "Query 1" + + +# --------------------------------------------------------------------------- +# r3 review: _extract_result_counts paths +# --------------------------------------------------------------------------- + + +class TestExtractResultCounts: + """Tests for all _extract_result_counts code paths.""" + + def test_dict_passthrough(self): + """Path 1: result_counts is already a dict.""" + run = MagicMock() + run.result_counts = {"passed": 3, "failed": 1} + assert _extract_result_counts(run) == {"passed": 3, "failed": 1} + + def test_vars_extraction(self): + """Path 2: result_counts is an object with vars().""" + + class Counts: + def __init__(self): + self.passed = 5 + self.failed = 2 + self.label = "info" # non-int, should be filtered + + run = 
MagicMock() + run.result_counts = Counts() + result = _extract_result_counts(run) + assert result is not None + assert result["passed"] == 5 + assert result["failed"] == 2 + assert "label" not in result + + def test_type_error_fallback(self): + """Path 3: result_counts has no __dict__ (e.g. an int) → None.""" + run = MagicMock() + run.result_counts = 42 # can't call vars() on an int + assert _extract_result_counts(run) is None + + def test_none_result_counts(self): + run = MagicMock() + run.result_counts = None + assert _extract_result_counts(run) is None + + +# --------------------------------------------------------------------------- +# r3 review: _extract_per_evaluator +# --------------------------------------------------------------------------- + + +class TestExtractPerEvaluator: + """Tests for _extract_per_evaluator with mock data.""" + + def test_with_per_testing_criteria_results(self): + """Parses per_testing_criteria_results into per-evaluator breakdown.""" + + @dataclass + class CriteriaItem: + testing_criteria: str + passed: int + failed: int + + run = MagicMock() + run.per_testing_criteria_results = [ + CriteriaItem("relevance", 4, 1), + CriteriaItem("coherence", 5, 0), + ] + result = _extract_per_evaluator(run) + assert "relevance" in result + assert result["relevance"] == {"passed": 4, "failed": 1} + assert "coherence" in result + assert result["coherence"] == {"passed": 5, "failed": 0} + + def test_with_testing_criteria_attr(self): + """Uses testing_criteria field (the real SDK field name).""" + + @dataclass + class CriteriaItem: + testing_criteria: str + passed: int + failed: int + + run = MagicMock() + run.per_testing_criteria_results = [CriteriaItem("fluency", 3, 2)] + result = _extract_per_evaluator(run) + assert "fluency" in result + assert result["fluency"]["passed"] == 3 + + def test_none_per_testing_criteria(self): + run = MagicMock() + run.per_testing_criteria_results = None + assert _extract_per_evaluator(run) == {} + + +# --------------------------------------------------------------------------- +# r3 review: _resolve_openai_client async check +# --------------------------------------------------------------------------- + + +class TestResolveOpenaiClientAsyncCheck: + """Tests for the async client runtime check.""" + + def test_sync_client_raises(self): + """A sync project_client raises TypeError (not an AsyncOpenAI instance).""" + mock_project = MagicMock() + sync_client = MagicMock() # plain MagicMock, not isinstance(AsyncOpenAI) + mock_project.get_openai_client.return_value = sync_client + + with pytest.raises(TypeError, match="sync client"): + _resolve_openai_client(project_client=mock_project) + + +# --------------------------------------------------------------------------- +# r5 review: evaluator set consistency (replaces import-time asserts) +# --------------------------------------------------------------------------- + + +class TestEvaluatorSetConsistency: + """Verify that _AGENT_EVALUATORS and _TOOL_EVALUATORS are subsets of _BUILTIN_EVALUATORS.""" + + def test_agent_evaluators_subset(self): + from agent_framework_foundry._foundry_evals import _AGENT_EVALUATORS, _BUILTIN_EVALUATORS + + diff = _AGENT_EVALUATORS - set(_BUILTIN_EVALUATORS.values()) + assert not diff, f"_AGENT_EVALUATORS has names not in _BUILTIN_EVALUATORS: {diff}" + + def test_tool_evaluators_subset(self): + from agent_framework_foundry._foundry_evals import _BUILTIN_EVALUATORS, _TOOL_EVALUATORS + + diff = _TOOL_EVALUATORS - set(_BUILTIN_EVALUATORS.values()) + assert not diff, 
f"_TOOL_EVALUATORS has names not in _BUILTIN_EVALUATORS: {diff}" + + +# --------------------------------------------------------------------------- +# r5 review: evaluate_traces with agent_id only +# --------------------------------------------------------------------------- + + +class TestEvaluateTracesAgentId: + async def test_agent_id_only_path(self) -> None: + """evaluate_traces with agent_id only builds azure_ai_traces data source.""" + from agent_framework_foundry._foundry_evals import evaluate_traces + + mock_client = MagicMock() + + mock_eval = MagicMock() + mock_eval.id = "eval_aid" + mock_client.evals.create = AsyncMock(return_value=mock_eval) + + mock_run = MagicMock() + mock_run.id = "run_aid" + mock_client.evals.runs.create = AsyncMock(return_value=mock_run) + + mock_completed = MagicMock() + mock_completed.status = "completed" + mock_completed.result_counts = {"passed": 2, "failed": 0} + mock_completed.report_url = None + mock_completed.per_testing_criteria_results = None + mock_client.evals.runs.retrieve = AsyncMock(return_value=mock_completed) + + mock_client.evals.runs.output_items.list = AsyncMock(return_value=_AsyncPage([])) + + results = await evaluate_traces( + agent_id="my-agent", + client=mock_client, + model="gpt-4o", + lookback_hours=24, + ) + assert results.status == "completed" + + run_call = mock_client.evals.runs.create.call_args + ds = run_call.kwargs["data_source"] + assert ds["type"] == "azure_ai_traces" + assert ds["agent_id"] == "my-agent" + assert ds["lookback_hours"] == 24 + assert "trace_ids" not in ds + + +# --------------------------------------------------------------------------- +# r5 review: _filter_tool_evaluators raises ValueError +# --------------------------------------------------------------------------- + + +class TestFilterToolEvaluatorsRaises: + def test_all_tool_evaluators_no_tools_raises(self): + """All tool evaluators + no items with tools → ValueError.""" + items = [EvalItem(conversation=[Message("user", ["Hi"]), Message("assistant", ["Hello"])])] + with pytest.raises(ValueError, match="require tool definitions"): + _filter_tool_evaluators(["builtin.tool_call_accuracy", "builtin.tool_selection"], items) + + +# --------------------------------------------------------------------------- +# r5 review: evaluate_foundry_target validates target dict +# --------------------------------------------------------------------------- + + +class TestEvaluateFoundryTargetValidation: + async def test_target_without_type_raises(self) -> None: + """target dict without 'type' key raises ValueError.""" + from agent_framework_foundry._foundry_evals import evaluate_foundry_target + + mock_client = MagicMock() + with pytest.raises(ValueError, match="'type' key"): + await evaluate_foundry_target( + target={"name": "my-agent"}, # missing "type" + test_queries=["Hello"], + client=mock_client, + model="gpt-4o", + ) diff --git a/python/samples/02-agents/evaluation/evaluate_agent.py b/python/samples/02-agents/evaluation/evaluate_agent.py new file mode 100644 index 0000000000..ac28520291 --- /dev/null +++ b/python/samples/02-agents/evaluation/evaluate_agent.py @@ -0,0 +1,81 @@ +# Copyright (c) Microsoft. All rights reserved. + +"""Evaluate an agent with local checks — no API keys needed. + +Demonstrates the simplest evaluation workflow: +1. Define checks using the @evaluator decorator +2. Run evaluate_agent() which calls agent.run() under the covers +3. 
Assert results in CI or inspect interactively + +Usage: + uv run python samples/02-agents/evaluation/evaluate_agent.py +""" + +import asyncio +import os + +from agent_framework import ( + Agent, + LocalEvaluator, + evaluate_agent, + evaluator, + keyword_check, +) +from agent_framework.foundry import FoundryChatClient +from azure.identity import AzureCliCredential +from dotenv import load_dotenv + +load_dotenv() + + +# A custom check — parameter names determine what data you receive +@evaluator +def is_helpful(response: str) -> bool: + """Check the response isn't empty or a refusal.""" + refusals = ["i can't", "i'm not able", "i don't know"] + return len(response) > 10 and not any(r in response.lower() for r in refusals) + + +async def main() -> None: + client = FoundryChatClient( + project_endpoint=os.environ["FOUNDRY_PROJECT_ENDPOINT"], + model=os.environ.get("FOUNDRY_MODEL", "gpt-4o"), + credential=AzureCliCredential(), + ) + + agent = Agent( + client=client, + name="weather-assistant", + instructions="You are a helpful weather assistant.", + ) + + # Combine built-in and custom checks + local = LocalEvaluator( + keyword_check("weather"), # response must mention "weather" + is_helpful, # custom check + ) + + # evaluate_agent() calls agent.run() for each query, then evaluates + results = await evaluate_agent( + agent=agent, + queries=[ + "What's the weather like in Seattle?", + "Will it rain in London tomorrow?", + "What should I wear for 30°C weather?", + ], + evaluators=local, + ) + + for r in results: + print(f"{r.provider}: {r.passed}/{r.total} passed") + for item in r.items: + print(f" [{item.status}] Q: {item.input_text[:50]} A: {item.output_text[:50]}...") + for score in item.scores: + print(f" {score.name}: {'✓' if score.passed else '✗'}") + + # Use in CI: will raise EvalNotPassedError if any check fails + # results[0].raise_for_status() + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/python/samples/02-agents/evaluation/evaluate_with_expected.py b/python/samples/02-agents/evaluation/evaluate_with_expected.py new file mode 100644 index 0000000000..0127037f79 --- /dev/null +++ b/python/samples/02-agents/evaluation/evaluate_with_expected.py @@ -0,0 +1,73 @@ +# Copyright (c) Microsoft. All rights reserved. + +"""Evaluate an agent with expected outputs and tool call checks. + +Demonstrates ground-truth comparison and tool usage evaluation: +1. Provide expected outputs alongside queries +2. Use built-in tool_calls_present for tool verification +3. 
Combine multiple evaluation criteria + +Usage: + uv run python samples/02-agents/evaluation/evaluate_with_expected.py +""" + +import asyncio +import os + +from agent_framework import ( + Agent, + LocalEvaluator, + evaluate_agent, + evaluator, + tool_calls_present, +) +from agent_framework.foundry import FoundryChatClient +from azure.identity import AzureCliCredential +from dotenv import load_dotenv + +load_dotenv() + + +@evaluator +def response_matches_expected(response: str, expected_output: str) -> float: + """Score based on word overlap with expected output.""" + if not expected_output: + return 1.0 + response_words = set(response.lower().split()) + expected_words = set(expected_output.lower().split()) + return len(response_words & expected_words) / max(len(expected_words), 1) + + +async def main() -> None: + client = FoundryChatClient( + project_endpoint=os.environ["FOUNDRY_PROJECT_ENDPOINT"], + model=os.environ.get("AZURE_AI_MODEL_DEPLOYMENT_NAME", "gpt-4o"), + credential=AzureCliCredential(), + ) + + agent = Agent( + client=client, + name="math-tutor", + instructions="You are a math tutor. Answer concisely.", + ) + + local = LocalEvaluator( + response_matches_expected, + tool_calls_present, # verifies expected tools were called + ) + + results = await evaluate_agent( + agent=agent, + queries=["What is 2 + 2?", "What is the square root of 144?"], + expected_output=["4", "12"], + evaluators=local, + ) + + for r in results: + print(f"{r.provider}: {r.passed}/{r.total} passed") + for item in r.items: + print(f" [{item.status}] {item.input_text} → {item.output_text[:80]}") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/python/samples/03-workflows/evaluation/evaluate_workflow.py b/python/samples/03-workflows/evaluation/evaluate_workflow.py new file mode 100644 index 0000000000..2a44182c3d --- /dev/null +++ b/python/samples/03-workflows/evaluation/evaluate_workflow.py @@ -0,0 +1,69 @@ +# Copyright (c) Microsoft. All rights reserved. + +"""Evaluate a multi-agent workflow with per-agent breakdown. + +Demonstrates workflow evaluation: +1. Build a simple two-agent workflow +2. Run evaluate_workflow() which runs the workflow and evaluates each agent +3. Inspect per-agent results in sub_results + +Usage: + uv run python samples/03-workflows/evaluation/evaluate_workflow.py +""" + +import asyncio +import os + +from agent_framework import ( + Agent, + LocalEvaluator, + WorkflowBuilder, + evaluate_workflow, + evaluator, + keyword_check, +) +from agent_framework.foundry import FoundryChatClient +from azure.identity import AzureCliCredential +from dotenv import load_dotenv + +load_dotenv() + + +@evaluator +def is_nonempty(response: str) -> bool: + """Check the agent produced a non-trivial response.""" + return len(response.strip()) > 5 + + +async def main() -> None: + # Build a simple planner → executor workflow + client = FoundryChatClient( + project_endpoint=os.environ["FOUNDRY_PROJECT_ENDPOINT"], + model=os.environ.get("AZURE_AI_MODEL_DEPLOYMENT_NAME", "gpt-4o"), + credential=AzureCliCredential(), + ) + planner = Agent(client=client, name="planner", instructions="You plan trips. Output a bullet-point plan.") + executor_agent = Agent( + client=client, name="executor", instructions="You execute travel plans. Book the items listed." 
+ ) + + workflow = WorkflowBuilder(start_executor=planner).add_edge(planner, executor_agent).build() + + # Evaluate with per-agent breakdown + local = LocalEvaluator(is_nonempty, keyword_check("plan", "trip")) + + results = await evaluate_workflow( + workflow=workflow, + queries=["Plan a weekend trip to Paris"], + evaluators=local, + ) + + for r in results: + print(f"{r.provider}: {r.passed}/{r.total} passed (overall)") + for agent_name, sub in r.sub_results.items(): + error = f" (error: {sub.error})" if sub.error else "" + print(f" {agent_name}: {sub.passed}/{sub.total} {error}") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/.env.example b/python/samples/05-end-to-end/evaluation/foundry_evals/.env.example new file mode 100644 index 0000000000..b6a8af233e --- /dev/null +++ b/python/samples/05-end-to-end/evaluation/foundry_evals/.env.example @@ -0,0 +1,3 @@ +FOUNDRY_PROJECT_ENDPOINT="" +FOUNDRY_MODEL="" + diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/README.md b/python/samples/05-end-to-end/evaluation/foundry_evals/README.md new file mode 100644 index 0000000000..81412a7f0e --- /dev/null +++ b/python/samples/05-end-to-end/evaluation/foundry_evals/README.md @@ -0,0 +1,46 @@ +# Foundry Evals Integration Samples + +These samples demonstrate evaluating agent-framework agents using Azure AI Foundry's built-in evaluators. + +## Available Evaluators + +| Category | Evaluators | +|----------|-----------| +| **Agent behavior** | `intent_resolution`, `task_adherence`, `task_completion`, `task_navigation_efficiency` | +| **Tool usage** | `tool_call_accuracy`, `tool_selection`, `tool_input_accuracy`, `tool_output_utilization`, `tool_call_success` | +| **Quality** | `coherence`, `fluency`, `relevance`, `groundedness`, `response_completeness`, `similarity` | +| **Safety** | `violence`, `sexual`, `self_harm`, `hate_unfairness` | + +## Samples + +### `evaluate_agent_sample.py` — Dataset Evaluation (Path 3) + +The dev inner loop. Two patterns from simplest to most control: + +1. **`evaluate_agent()`** — One call: runs agent → converts → evaluates +2. **`FoundryEvals.evaluate()`** — Run agent yourself, convert with `AgentEvalConverter`, inspect/modify, then evaluate + +```bash +uv run samples/05-end-to-end/evaluation/foundry_evals/evaluate_agent_sample.py +``` + +### `evaluate_traces_sample.py` — Trace & Response Evaluation (Path 1) + +Evaluate what already happened — zero changes to agent code: + +1. **`evaluate_traces(response_ids=...)`** — Evaluate Responses API responses by ID +2. **`evaluate_traces(agent_id=...)`** — Evaluate agent behavior from OTel traces in App Insights + +```bash +uv run samples/05-end-to-end/evaluation/foundry_evals/evaluate_traces_sample.py +``` + +## Setup + +Create a `.env` file with configuration as in the `.env.example` file in this folder. + +## Which sample should I start with? 
+ +- **"I want to test my agent during development"** → `evaluate_agent_sample.py`, Pattern 1 +- **"I want to evaluate past agent runs"** → `evaluate_traces_sample.py` +- **"I want to inspect/modify eval data before submitting"** → `evaluate_agent_sample.py`, Pattern 2 diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_agent_sample.py b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_agent_sample.py new file mode 100644 index 0000000000..94680d80a2 --- /dev/null +++ b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_agent_sample.py @@ -0,0 +1,159 @@ +# Copyright (c) Microsoft. All rights reserved. + +"""Evaluate an agent using Azure AI Foundry's built-in evaluators. + +This sample demonstrates two patterns: +1. evaluate_agent(responses=...) — Evaluate a response you already have. +2. evaluate_agent(queries=...) — Run the agent against test queries and evaluate in one call. + +See ``evaluate_tool_calls_sample.py`` for tool-call accuracy evaluation. + +Prerequisites: +- An Azure AI Foundry project with a deployed model +- Set FOUNDRY_PROJECT_ENDPOINT and AZURE_AI_MODEL_DEPLOYMENT_NAME in .env +""" + +import asyncio +import os + +from agent_framework import Agent, ConversationSplit, evaluate_agent +from agent_framework.foundry import FoundryChatClient, FoundryEvals +from azure.ai.projects.aio import AIProjectClient +from azure.identity.aio import AzureCliCredential +from dotenv import load_dotenv + +load_dotenv() + + +# Define a simple tool for the agent +def get_weather(location: str) -> str: + """Get the current weather for a location.""" + weather_data = { + "seattle": "62°F, cloudy with a chance of rain", + "london": "55°F, overcast", + "paris": "68°F, partly sunny", + } + return weather_data.get(location.lower(), f"Weather data not available for {location}") + + +def get_flight_price(origin: str, destination: str) -> str: + """Get the price of a flight between two cities.""" + return f"Flights from {origin} to {destination}: $450 round-trip" + + +async def main() -> None: + # 1. Set up the Azure AI project client + project_client = AIProjectClient( + endpoint=os.environ["FOUNDRY_PROJECT_ENDPOINT"], + credential=AzureCliCredential(), + ) + + deployment = os.environ.get("AZURE_AI_MODEL_DEPLOYMENT_NAME", "gpt-4o") + + chat_client = FoundryChatClient(project_client=project_client, model=deployment) + + # 2. Create an agent with tools + agent = Agent( + client=chat_client, + name="travel-assistant", + instructions=( + "You are a helpful travel assistant. Use your tools to answer questions about weather and flights." + ), + tools=[get_weather, get_flight_price], + ) + + # 3. Create the evaluator — provider config goes here, once + evals = FoundryEvals(client=chat_client, model=deployment) + + # ========================================================================= + # Pattern 1: evaluate_agent(responses=...) — evaluate a response you already have + # ========================================================================= + print("=" * 60) + print("Pattern 1: evaluate_agent(responses=...) — evaluate existing response") + print("=" * 60) + + query = "How much does a flight from Seattle to Paris cost?" 
+ response = await agent.run(query) + print(f"Agent said: {response.text[:100]}...") + + # Pass agent= so tool definitions are extracted, queries= for the eval item context + results = await evaluate_agent( + agent=agent, + responses=response, + queries=[query], + evaluators=FoundryEvals( + client=chat_client, + model=deployment, + evaluators=[FoundryEvals.RELEVANCE, FoundryEvals.TOOL_CALL_ACCURACY], + ), + ) + + for r in results: + print(f"Status: {r.status}") + print(f"Results: {r.passed}/{r.total} passed") + print(f"Portal: {r.report_url}") + if r.all_passed: + print("✓ All passed") + else: + print(f"✗ {r.failed} failed") + + # ========================================================================= + # Pattern 2a: evaluate_agent() — batch test queries + # ========================================================================= + print() + print("=" * 60) + print("Pattern 2a: evaluate_agent()") + print("=" * 60) + + # Calls agent.run() under the covers for each query, then evaluates + results = await evaluate_agent( + agent=agent, + queries=[ + "What's the weather like in Seattle?", + "How much does a flight from Seattle to Paris cost?", + "What should I pack for London?", + ], + evaluators=evals, # uses smart defaults (auto-adds tool_call_accuracy) + ) + + for r in results: + print(f"Status: {r.status}") + print(f"Results: {r.passed}/{r.total} passed") + print(f"Portal: {r.report_url}") + if r.all_passed: + print("✓ All passed") + else: + print(f"✗ {r.failed} failed") + + # ========================================================================= + # Pattern 2b: evaluate_agent() — with conversation split override + # ========================================================================= + print() + print("=" * 60) + print("Pattern 2b: evaluate_agent() with conversation_split") + print("=" * 60) + + # conversation_split forces all evaluators to use the same split strategy. + # FULL evaluates the entire conversation trajectory against the original query. + results = await evaluate_agent( + agent=agent, + queries=[ + "What's the weather like in Seattle?", + "What should I pack for London?", + ], + evaluators=evals, + conversation_split=ConversationSplit.FULL, # overrides evaluator defaults + ) + + for r in results: + print(f"Status: {r.status}") + print(f"Results: {r.passed}/{r.total} passed") + print(f"Portal: {r.report_url}") + if r.all_passed: + print("✓ All passed") + else: + print(f"✗ {r.failed} failed") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_mixed_sample.py b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_mixed_sample.py new file mode 100644 index 0000000000..4f5288ea5a --- /dev/null +++ b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_mixed_sample.py @@ -0,0 +1,163 @@ +# Copyright (c) Microsoft. All rights reserved. + +"""Mix local and cloud evaluation providers in a single evaluate_agent() call. + +This sample demonstrates three patterns: +1. Local-only: Fast, API-free checks for inner-loop development. +2. Cloud-only: Full Foundry evaluators for comprehensive quality assessment. +3. Mixed: Local + Foundry evaluators in a single evaluate_agent() call. + +Mixing lets you get instant local feedback (keyword presence, tool usage) +alongside deeper cloud-based quality evaluation (relevance, coherence) +in one call. 
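+
+As a condensed sketch (the full runnable version is Pattern 3 in main() below),
+the mixed call looks like:
+
+    local = LocalEvaluator(keyword_check("weather"), tool_called_check("get_weather"))
+    foundry = FoundryEvals(client=chat_client, model=deployment)
+    results = await evaluate_agent(
+        agent=agent,
+        queries=["What's the weather in Seattle?"],
+        evaluators=[local, foundry],  # one EvalResults per provider
+    )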
+ +Prerequisites: +- An Azure AI Foundry project with a deployed model +- Set FOUNDRY_PROJECT_ENDPOINT and AZURE_AI_MODEL_DEPLOYMENT_NAME in .env +""" + +import asyncio +import os + +from agent_framework import ( + Agent, + LocalEvaluator, + evaluate_agent, + keyword_check, + tool_called_check, +) +from agent_framework.foundry import FoundryChatClient, FoundryEvals +from azure.ai.projects.aio import AIProjectClient +from azure.identity.aio import AzureCliCredential +from dotenv import load_dotenv + +load_dotenv() + + +# Define a simple tool for the agent +def get_weather(location: str) -> str: + """Get the current weather for a location.""" + weather_data = { + "seattle": "62°F, cloudy with a chance of rain", + "london": "55°F, overcast", + "paris": "68°F, partly sunny", + } + return weather_data.get(location.lower(), f"Weather data not available for {location}") + + +async def main() -> None: + # 1. Set up the Azure AI project client + project_client = AIProjectClient( + endpoint=os.environ["FOUNDRY_PROJECT_ENDPOINT"], + credential=AzureCliCredential(), + ) + + deployment = os.environ.get("AZURE_AI_MODEL_DEPLOYMENT_NAME", "gpt-4o") + + chat_client = FoundryChatClient(project_client=project_client, model=deployment) + + # 2. Create an agent with a tool + agent = Agent( + client=chat_client, + name="weather-assistant", + instructions="You are a helpful weather assistant. Use the get_weather tool to answer questions.", + tools=[get_weather], + ) + + # ========================================================================= + # Pattern 1: Local evaluation only (no API calls, instant results) + # ========================================================================= + print("=" * 60) + print("Pattern 1: Local evaluation only") + print("=" * 60) + + local = LocalEvaluator( + keyword_check("weather", "seattle"), + tool_called_check("get_weather"), + ) + + results = await evaluate_agent( + agent=agent, + queries=["What's the weather in Seattle?"], + evaluators=local, + ) + + for r in results: + print(f"Status: {r.status}") + print(f"Results: {r.passed}/{r.total} passed") + for check_name, counts in r.per_evaluator.items(): + print(f" {check_name}: {counts['passed']} passed, {counts['failed']} failed") + if r.all_passed: + print("✓ All local checks passed!") + else: + print(f"✗ Failures: {r.error}") + + # ========================================================================= + # Pattern 2: Foundry evaluation only (cloud-based quality assessment) + # ========================================================================= + print() + print("=" * 60) + print("Pattern 2: Foundry evaluation only") + print("=" * 60) + + foundry = FoundryEvals(client=chat_client, model=deployment) + + results = await evaluate_agent( + agent=agent, + queries=["What's the weather in Seattle?"], + evaluators=foundry, + ) + + for r in results: + print(f"Status: {r.status}") + print(f"Results: {r.passed}/{r.total} passed") + print(f"Portal: {r.report_url}") + if r.all_passed: + print("✓ All passed") + else: + print(f"✗ {r.failed} failed") + + # ========================================================================= + # Pattern 3: Mixed — local + Foundry in one call + # ========================================================================= + print() + print("=" * 60) + print("Pattern 3: Mixed local + Foundry evaluation") + print("=" * 60) + + # Local checks: fast smoke tests + local = LocalEvaluator( + keyword_check("weather"), + tool_called_check("get_weather"), + ) + + # Foundry: deep quality assessment + 
foundry = FoundryEvals(client=chat_client, model=deployment) + + # Pass both as a list — returns one EvalResults per provider + results = await evaluate_agent( + agent=agent, + queries=[ + "What's the weather in Seattle?", + "Tell me the weather in London", + ], + evaluators=[local, foundry], + ) + + for r in results: + status = "✓" if r.all_passed else "✗" + print(f" {status} {r.provider}: {r.passed}/{r.total} passed") + for check_name, counts in r.per_evaluator.items(): + print(f" {check_name}: {counts['passed']}/{counts['passed'] + counts['failed']}") + if r.report_url: + print(f" Portal: {r.report_url}") + + if all(r.all_passed for r in results): + print("✓ All checks passed (local + Foundry)!") + else: + failed = [r.provider for r in results if not r.all_passed] + print(f"✗ Failed providers: {', '.join(failed)}") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_multiturn_sample.py b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_multiturn_sample.py new file mode 100644 index 0000000000..e0a791ba10 --- /dev/null +++ b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_multiturn_sample.py @@ -0,0 +1,188 @@ +# Copyright (c) Microsoft. All rights reserved. + +"""Evaluate multi-turn conversations with different split strategies. + +The same multi-turn conversation can be split different ways, each evaluating +a different aspect of agent behavior: + +1. LAST_TURN (default) — "Was the last response good given context?" +2. FULL — "Did the whole conversation serve the original request?" +3. per_turn_items — "Was each individual response appropriate?" + +Prerequisites: +- An Azure AI Foundry project with a deployed model +- Set FOUNDRY_PROJECT_ENDPOINT and AZURE_AI_MODEL_DEPLOYMENT_NAME in .env +""" + +import asyncio +import os + +from agent_framework import Content, ConversationSplit, EvalItem, FunctionTool, Message +from agent_framework.foundry import FoundryChatClient, FoundryEvals +from azure.ai.projects.aio import AIProjectClient +from azure.identity.aio import AzureCliCredential +from dotenv import load_dotenv + +load_dotenv() + +# A multi-turn conversation with tool calls that we'll evaluate three ways. +# Uses framework Message/Content types for type-safe conversation construction. +CONVERSATION: list[Message] = [ + # Turn 1: user asks about weather → agent calls tool → responds + Message("user", ["What's the weather in Seattle?"]), + Message( + "assistant", + [ + Content.from_function_call("c1", "get_weather", arguments={"location": "seattle"}), + ], + ), + Message( + "tool", + [ + Content.from_function_result("c1", result="62°F, cloudy with a chance of rain"), + ], + ), + Message("assistant", ["Seattle is 62°F, cloudy with a chance of rain."]), + # Turn 2: user asks about Paris → agent calls tool → responds + Message("user", ["And Paris?"]), + Message( + "assistant", + [ + Content.from_function_call("c2", "get_weather", arguments={"location": "paris"}), + ], + ), + Message( + "tool", + [ + Content.from_function_result("c2", result="68°F, partly sunny"), + ], + ), + Message("assistant", ["Paris is 68°F, partly sunny."]), + # Turn 3: user asks for comparison → agent synthesizes without tool + Message("user", ["Can you compare them?"]), + Message( + "assistant", + [ + ( + "Seattle is cooler at 62°F with rain likely, while Paris is warmer " + "at 68°F and partly sunny. Paris is the better choice for outdoor activities." 
+ ), + ], + ), +] + +TOOLS = [ + FunctionTool( + name="get_weather", + description="Get the current weather for a location.", + ), +] + + +def print_split(item: EvalItem, split: ConversationSplit = ConversationSplit.LAST_TURN) -> None: + """Print the query/response split for an EvalItem.""" + query_msgs, response_msgs = item.split_messages(split) + print(f" query_messages ({len(query_msgs)}):") + for m in query_msgs: + text = m.text or "" + print(f" {m.role}: {text[:70]}") + print(f" response_messages ({len(response_msgs)}):") + for m in response_msgs: + text = m.text or "" + print(f" {m.role}: {text[:70]}") + + +async def main() -> None: + project_client = AIProjectClient( + endpoint=os.environ["FOUNDRY_PROJECT_ENDPOINT"], + credential=AzureCliCredential(), + ) + deployment = os.environ.get("AZURE_AI_MODEL_DEPLOYMENT_NAME", "gpt-4o") + + chat_client = FoundryChatClient(project_client=project_client, model=deployment) + + # ========================================================================= + # Strategy 1: LAST_TURN (default) + # "Given all context, was the last response good?" + # ========================================================================= + print("=" * 70) + print("Strategy 1: LAST_TURN — evaluate the final response") + print("=" * 70) + + # EvalItem takes conversation + tools; query/response are derived via split strategy + item = EvalItem(CONVERSATION, tools=TOOLS) + + print_split(item, ConversationSplit.LAST_TURN) + + results = await FoundryEvals( + client=chat_client, + model=deployment, + evaluators=[FoundryEvals.RELEVANCE, FoundryEvals.COHERENCE], + # conversation_split defaults to LAST_TURN + ).evaluate([item], eval_name="Split Strategy: LAST_TURN") + + print(f"\n Result: {results.passed}/{results.total} passed") + print(f" Portal: {results.report_url}") + for ir in results.items: + for s in ir.scores: + print(f" {'✓' if s.passed else '✗'} {s.name}: {s.score}") + print() + + # ========================================================================= + # Strategy 2: FULL + # "Given the original request, did the whole conversation serve the user?" + # ========================================================================= + print("=" * 70) + print("Strategy 2: FULL — evaluate the entire conversation trajectory") + print("=" * 70) + + print_split(item, ConversationSplit.FULL) + + results = await FoundryEvals( + client=chat_client, + model=deployment, + evaluators=[FoundryEvals.RELEVANCE, FoundryEvals.COHERENCE], + conversation_split=ConversationSplit.FULL, + ).evaluate([item], eval_name="Split Strategy: FULL") + + print(f"\n Result: {results.passed}/{results.total} passed") + print(f" Portal: {results.report_url}") + for ir in results.items: + for s in ir.scores: + print(f" {'✓' if s.passed else '✗'} {s.name}: {s.score}") + print() + + # ========================================================================= + # Strategy 3: per_turn_items + # "Was each individual response appropriate at that point?" 
+ # ========================================================================= + print("=" * 70) + print("Strategy 3: per_turn_items — evaluate each turn independently") + print("=" * 70) + + items = EvalItem.per_turn_items(CONVERSATION, tools=TOOLS) + print(f" Split into {len(items)} items from {len(CONVERSATION)} messages:\n") + for i, it in enumerate(items): + print(f" Turn {i + 1}: query={it.query!r}, response={it.response[:60]!r}...") + print() + + results = await FoundryEvals( + client=chat_client, + model=deployment, + evaluators=[FoundryEvals.RELEVANCE, FoundryEvals.COHERENCE], + ).evaluate(items, eval_name="Split Strategy: Per-Turn") + + print(f"\n Result: {results.passed}/{results.total} passed ({len(items)} items × 2 evaluators)") + print(f" Portal: {results.report_url}") + for ir in results.items: + for s in ir.scores: + print(f" {'✓' if s.passed else '✗'} {s.name}: {s.score}") + print() + + print("=" * 70) + print("All strategies complete. Compare results in the Foundry portal.") + print("=" * 70) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_tool_calls_sample.py b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_tool_calls_sample.py new file mode 100644 index 0000000000..858957b5c1 --- /dev/null +++ b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_tool_calls_sample.py @@ -0,0 +1,93 @@ +# Copyright (c) Microsoft. All rights reserved. + +"""Evaluate tool-calling accuracy using Azure AI Foundry's TOOL_CALL_ACCURACY evaluator. + +This sample demonstrates evaluating how well an agent selects and invokes tools +by using ``FoundryEvals.evaluate()`` with ``TOOL_CALL_ACCURACY``. + +Prerequisites: +- An Azure AI Foundry project with a deployed model +- Set FOUNDRY_PROJECT_ENDPOINT and AZURE_AI_MODEL_DEPLOYMENT_NAME in .env +""" + +import asyncio +import os + +from agent_framework import Agent, AgentEvalConverter +from agent_framework.foundry import FoundryChatClient, FoundryEvals +from azure.ai.projects.aio import AIProjectClient +from azure.identity.aio import AzureCliCredential +from dotenv import load_dotenv + +load_dotenv() + + +def get_weather(location: str) -> str: + """Get the current weather for a location.""" + weather_data = { + "seattle": "62°F, cloudy with a chance of rain", + "london": "55°F, overcast", + "paris": "68°F, partly sunny", + } + return weather_data.get(location.lower(), f"Weather data not available for {location}") + + +def get_flight_price(origin: str, destination: str) -> str: + """Get the price of a flight between two cities.""" + return f"Flights from {origin} to {destination}: $450 round-trip" + + +async def main() -> None: + project_client = AIProjectClient( + endpoint=os.environ["FOUNDRY_PROJECT_ENDPOINT"], + credential=AzureCliCredential(), + ) + + deployment = os.environ.get("AZURE_AI_MODEL_DEPLOYMENT_NAME", "gpt-4o") + chat_client = FoundryChatClient(project_client=project_client, model=deployment) + + # Create an agent with tools + agent = Agent( + client=chat_client, + name="travel-assistant", + instructions=( + "You are a helpful travel assistant. " + "Use your tools to answer questions about weather and flights." 
+ ), + tools=[get_weather, get_flight_price], + ) + + # Run the agent and convert responses to eval items + queries = [ + "What's the weather in Paris?", + "Find me a flight from London to Seattle", + ] + + items = [] + for q in queries: + response = await agent.run(q) + print(f"Query: {q}") + print(f"Response: {response.text[:100]}...") + + item = AgentEvalConverter.to_eval_item(query=q, response=response, agent=agent) + items.append(item) + + print(f" Has tools: {item.tools is not None}") + if item.tools: + print(f" Tools: {[t.name for t in item.tools]}") + + # Submit to Foundry with tool_call_accuracy evaluator + evals = FoundryEvals( + client=chat_client, + model=deployment, + evaluators=[FoundryEvals.RELEVANCE, FoundryEvals.TOOL_CALL_ACCURACY], + ) + results = await evals.evaluate(items, eval_name="Tool Call Accuracy Eval") + + print(f"\nStatus: {results.status}") + print(f"Results: {results.passed}/{results.total} passed") + print(f"Portal: {results.report_url}") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_traces_sample.py b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_traces_sample.py new file mode 100644 index 0000000000..a563d14bff --- /dev/null +++ b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_traces_sample.py @@ -0,0 +1,122 @@ +# Copyright (c) Microsoft. All rights reserved. + +"""Evaluate agent responses that already exist in Foundry (zero-code-change). + +This sample demonstrates two patterns: +1. evaluate_traces(response_ids=...) — Evaluate specific Responses API responses by ID. +2. evaluate_traces(agent_id=...) — Evaluate agent behavior from OTel traces in App Insights. + +These are the "zero-code-change" evaluation paths — the agent has already run, +and you're evaluating what happened after the fact. + +Prerequisites: +- An Azure AI Foundry project with a deployed model +- Response IDs from prior agent runs (for Pattern 1) +- OTel traces exported to App Insights (for Pattern 2) +- Set FOUNDRY_PROJECT_ENDPOINT and AZURE_AI_MODEL_DEPLOYMENT_NAME in .env +""" + +import asyncio +import os + +from agent_framework.foundry import FoundryChatClient, FoundryEvals, evaluate_traces +from azure.ai.projects.aio import AIProjectClient +from azure.identity.aio import AzureCliCredential +from dotenv import load_dotenv + +load_dotenv() + + +async def main() -> None: + # 1. Set up the Azure AI project client + project_client = AIProjectClient( + endpoint=os.environ["FOUNDRY_PROJECT_ENDPOINT"], + credential=AzureCliCredential(), + ) + + deployment = os.environ.get("AZURE_AI_MODEL_DEPLOYMENT_NAME", "gpt-4o") + + chat_client = FoundryChatClient(project_client=project_client, model=deployment) + + # ========================================================================= + # Pattern 1: evaluate_traces(response_ids=...) — By response ID + # ========================================================================= + # If your agent uses the Responses API (e.g., FoundryChatClient), + # each run produces a response_id. Pass those IDs to evaluate_traces() + # and Foundry retrieves the full conversation for evaluation. 
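+    # (Illustrative only) One way to gather IDs is to record them as your agent runs —
+    # for example, append each run's response ID to a JSONL file — and load them back
+    # here for post-hoc evaluation (requires `import json`):
+    #
+    #     with open("response_ids.jsonl") as f:
+    #         response_ids = [json.loads(line)["response_id"] for line in f]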
+ print("=" * 60) + print("Pattern 1: evaluate_traces(response_ids=...)") + print("=" * 60) + + # Replace these with actual response IDs from your agent runs + response_ids = [ + "resp_abc123", + "resp_def456", + ] + + results = await evaluate_traces( + response_ids=response_ids, + evaluators=[FoundryEvals.RELEVANCE, FoundryEvals.GROUNDEDNESS, FoundryEvals.TOOL_CALL_ACCURACY], + client=chat_client, + model=deployment, + ) + + print(f"Status: {results.status}") + print(f"Results: {results.result_counts}") + print(f"Portal: {results.report_url}") + + # ========================================================================= + # Pattern 2: evaluate_traces(agent_id=...) — From App Insights + # ========================================================================= + # If your agent emits OTel traces to App Insights (via configure_otel_providers), + # you can evaluate recent activity without specifying individual response IDs. + # + # NOTE: Requires OTel traces exported to the App Insights instance connected + # to your Foundry project. The exact trace-based data source API is subject + # to change as Foundry evolves. + print() + print("=" * 60) + print("Pattern 2: evaluate_traces(agent_id=...)") + print("=" * 60) + + # Evaluate by response IDs (uses response-based data source internally) + results = await evaluate_traces( + response_ids=response_ids, + evaluators=[FoundryEvals.RELEVANCE, FoundryEvals.COHERENCE], + client=chat_client, + model=deployment, + ) + + print(f"Status: {results.status}") + print(f"Portal: {results.report_url}") + + # Evaluate by agent ID + time window (when trace-based API is available) + # results = await evaluate_traces( + # agent_id="travel-bot", + # evaluators=[FoundryEvals.INTENT_RESOLUTION, FoundryEvals.TASK_ADHERENCE], + # client=chat_client, + # model=deployment, + # lookback_hours=24, + # ) + + +if __name__ == "__main__": + asyncio.run(main()) + + +""" +Sample output (with actual Azure AI Foundry project and valid response IDs): + +============================================================ +Pattern 1: evaluate_traces(response_ids=...) +============================================================ +Status: completed +Results: {'passed': 2, 'failed': 0, 'errored': 0} +Portal: https://ai.azure.com/... + +============================================================ +Pattern 2: evaluate_traces(agent_id=...) +============================================================ +Status: completed +Portal: https://ai.azure.com/... +""" diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_workflow_sample.py b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_workflow_sample.py new file mode 100644 index 0000000000..fd3bcb7da8 --- /dev/null +++ b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_workflow_sample.py @@ -0,0 +1,181 @@ +# Copyright (c) Microsoft. All rights reserved. + +"""Evaluate a multi-agent workflow using Azure AI Foundry evaluators. + +This sample demonstrates two patterns: +1. Post-hoc: Run the workflow, then evaluate the result you already have. +2. Run + evaluate: Pass queries and let evaluate_workflow() run the workflow for you. + +Both patterns return a list of results (one per provider), each with a per-agent +breakdown in sub_results so you can identify which agent is underperforming. 
+ +Prerequisites: +- An Azure AI Foundry project with a deployed model +- Set FOUNDRY_PROJECT_ENDPOINT and AZURE_AI_MODEL_DEPLOYMENT_NAME in .env +""" + +import asyncio +import os + +from agent_framework import Agent, evaluate_workflow +from agent_framework.foundry import FoundryChatClient, FoundryEvals +from agent_framework_orchestrations import SequentialBuilder +from azure.ai.projects.aio import AIProjectClient +from azure.identity.aio import AzureCliCredential +from dotenv import load_dotenv + +load_dotenv() + + +# Simple tools for the agents +def get_weather(location: str) -> str: + """Get the current weather for a location.""" + weather_data = { + "seattle": "62°F, cloudy with a chance of rain", + "london": "55°F, overcast", + "paris": "68°F, partly sunny", + } + return weather_data.get(location.lower(), f"Weather data not available for {location}") + + +def get_flight_price(origin: str, destination: str) -> str: + """Get the price of a flight between two cities.""" + return f"Flights from {origin} to {destination}: $450 round-trip" + + +async def main() -> None: + # 1. Set up the Azure AI project client + project_client = AIProjectClient( + endpoint=os.environ["FOUNDRY_PROJECT_ENDPOINT"], + credential=AzureCliCredential(), + ) + + deployment = os.environ.get("AZURE_AI_MODEL_DEPLOYMENT_NAME", "gpt-4o") + + client = FoundryChatClient(project_client=project_client, model=deployment) + + # 2. Create agents for a sequential workflow + # Use store=False so agents don't chain conversation state via previous_response_id. + # This allows the workflow to be run multiple times without stale state issues. + researcher = Agent( + client=client, + name="researcher", + instructions=( + "You are a travel researcher. Use your tools to gather weather " + "and flight information for the destination the user asks about." + ), + tools=[get_weather, get_flight_price], + default_options={"store": False}, + ) + + planner = Agent( + client=client, + name="planner", + instructions=( + "You are a travel planner. Based on the research provided, " + "create a concise travel recommendation with packing tips." + ), + default_options={"store": False}, + ) + + # 3. Build a sequential workflow: researcher → planner + workflow = SequentialBuilder(participants=[researcher, planner]).build() + + # 4. 
Create the evaluator — provider config goes here, once + evals = FoundryEvals(client=client, model=deployment) + + # ========================================================================= + # Pattern 1: Post-hoc — evaluate a workflow run you already did + # ========================================================================= + print("=" * 60) + print("Pattern 1: Post-hoc workflow evaluation") + print("=" * 60) + + result = await workflow.run("Plan a trip from Seattle to Paris") + + eval_results = await evaluate_workflow( + workflow=workflow, + workflow_result=result, + evaluators=evals, + ) + + for r in eval_results: + print(f"\nOverall: {r.status}") + print(f" Passed: {r.passed}/{r.total}") + print(f" Portal: {r.report_url}") + + print("\nPer-agent breakdown:") + for agent_name, agent_eval in r.sub_results.items(): + print(f" {agent_name}: {agent_eval.passed}/{agent_eval.total} passed") + if agent_eval.report_url: + print(f" Portal: {agent_eval.report_url}") + + # ========================================================================= + # Pattern 2: Run + evaluate with multiple queries + # ========================================================================= + # Build a fresh workflow to avoid stale session state from Pattern 1. + # The Responses API tracks previous_response_id per session, so reusing + # a workflow after a run would reference stale tool calls. + workflow2 = SequentialBuilder(participants=[researcher, planner]).build() + + print() + print("=" * 60) + print("Pattern 2: Run + evaluate with multiple queries") + print("=" * 60) + + eval_results = await evaluate_workflow( + workflow=workflow2, + queries=[ + "Plan a trip from London to Tokyo", + "Plan a trip from New York to Rome", + ], + evaluators=FoundryEvals( + client=client, + model=deployment, + evaluators=[FoundryEvals.RELEVANCE, FoundryEvals.TASK_ADHERENCE], + ), + ) + + for r in eval_results: + print(f"\nOverall: {r.status}") + print(f" Passed: {r.passed}/{r.total}") + if r.report_url: + print(f" Portal: {r.report_url}") + + print("\nPer-agent breakdown:") + for agent_name, agent_eval in r.sub_results.items(): + print(f" {agent_name}: {agent_eval.passed}/{agent_eval.total} passed") + if agent_eval.report_url: + print(f" Portal: {agent_eval.report_url}") + + +if __name__ == "__main__": + asyncio.run(main()) + + +""" +Sample output (with actual Azure AI Foundry project): + +============================================================ +Pattern 1: Post-hoc workflow evaluation +============================================================ + +Overall: completed + Passed: 2/2 + Portal: https://ai.azure.com/... + +Per-agent breakdown: + researcher: 1/1 passed + planner: 1/1 passed + +============================================================ +Pattern 2: Run + evaluate with multiple queries +============================================================ + +Overall: completed + Passed: 4/4 + +Per-agent breakdown: + researcher: 2/2 passed + planner: 2/2 passed +""" diff --git a/python/samples/05-end-to-end/evaluation/self_reflection/.env.example b/python/samples/05-end-to-end/evaluation/self_reflection/.env.example index 413a62c0ff..8c24539c3c 100644 --- a/python/samples/05-end-to-end/evaluation/self_reflection/.env.example +++ b/python/samples/05-end-to-end/evaluation/self_reflection/.env.example @@ -1,3 +1 @@ -AZURE_OPENAI_ENDPOINT="..." -AZURE_OPENAI_API_KEY="..." 
-AZURE_AI_PROJECT_ENDPOINT="https://.services.ai.azure.com/api/projects//" +FOUNDRY_PROJECT_ENDPOINT=https://.services.ai.azure.com diff --git a/python/samples/05-end-to-end/evaluation/self_reflection/README.md b/python/samples/05-end-to-end/evaluation/self_reflection/README.md index 5c26f352e7..0591f37f73 100644 --- a/python/samples/05-end-to-end/evaluation/self_reflection/README.md +++ b/python/samples/05-end-to-end/evaluation/self_reflection/README.md @@ -6,31 +6,27 @@ This sample demonstrates the self-reflection pattern using Agent Framework and A **What it demonstrates:** - Iterative self-reflection loop that automatically improves responses based on groundedness evaluation +- Using `FoundryEvals` to score each iteration via the Foundry Groundedness evaluator - Batch processing of prompts from JSONL files with progress tracking -- Using `AzureOpenAIResponsesClient` with a Project Endpoint and Azure CLI authentication +- Using `FoundryChatClient` with a Project Endpoint and Azure CLI authentication - Comprehensive summary statistics and detailed result tracking ## Prerequisites ### Azure Resources -- **Azure OpenAI Responses in Foundry**: Deploy models (default: gpt-5.2 for both agent and judge) +- **Azure AI Foundry project**: Deploy models (default: gpt-5.2 for both agent and judge) - **Azure CLI**: Run `az login` to authenticate -### Python Environment -```bash -pip install agent-framework-core pandas --pre -``` - ### Environment Variables ```bash -AZURE_AI_PROJECT_ENDPOINT=https://.services.ai.azure.com/api/projects// +FOUNDRY_PROJECT_ENDPOINT=https://.services.ai.azure.com ``` ## Running the Sample ```bash # Basic usage -python self_reflection.py +uv run python samples/05-end-to-end/evaluation/self_reflection/self_reflection.py # With options python self_reflection.py --input my_prompts.jsonl \ @@ -42,8 +38,8 @@ python self_reflection.py --input my_prompts.jsonl \ **CLI Options:** - `--input`, `-i`: Input JSONL file - `--output`, `-o`: Output JSONL file -- `--agent-model`, `-m`: Agent model name (default: gpt-4.1) -- `--judge-model`, `-e`: Evaluator model name (default: gpt-4.1) +- `--agent-model`, `-m`: Agent model name (default: gpt-5.2) +- `--judge-model`, `-e`: Evaluator model name (default: gpt-5.2) - `--max-reflections`: Max iterations (default: 3) - `--limit`, `-n`: Process only first N prompts @@ -51,7 +47,7 @@ python self_reflection.py --input my_prompts.jsonl \ The agent iteratively improves responses: 1. Generate initial response -2. Evaluate groundedness (1-5 scale) +2. Evaluate groundedness via `FoundryEvals` (1-5 scale) 3. If score < 5, provide feedback and retry 4. 
Stop at max iterations or perfect score (5/5) @@ -70,7 +66,7 @@ In the Foundry UI, under `Build`/`Evaluations` you can view detailed results for - Context - Query - Response -- Groundedness scores and reasoning for each interation of each prompt +- Groundedness scores and reasoning for each iteration of each prompt ## Related Resources diff --git a/python/samples/05-end-to-end/evaluation/self_reflection/self_reflection.py b/python/samples/05-end-to-end/evaluation/self_reflection/self_reflection.py index 8251e89e72..238221de48 100644 --- a/python/samples/05-end-to-end/evaluation/self_reflection/self_reflection.py +++ b/python/samples/05-end-to-end/evaluation/self_reflection/self_reflection.py @@ -17,26 +17,20 @@ from pathlib import Path from typing import Any -import openai import pandas as pd -from agent_framework import Agent, Message -from agent_framework.foundry import FoundryChatClient -from azure.ai.projects import AIProjectClient -from azure.identity import AzureCliCredential +from agent_framework import Agent, EvalItem, Message +from agent_framework.foundry import FoundryChatClient, FoundryEvals +from azure.identity.aio import AzureCliCredential as AsyncAzureCliCredential from dotenv import load_dotenv -from openai.types.eval_create_params import DataSourceConfigCustom -from openai.types.evals.create_eval_jsonl_run_data_source_param import ( - CreateEvalJSONLRunDataSourceParam, - SourceFileContent, - SourceFileContentContent, -) """ Self-Reflection LLM Runner Reflexion: language agents with verbal reinforcement learning. Noah Shinn, Federico Cassano, Ashwin Gopinath, Karthik Narasimhan, and Shunyu Yao. 2023. -In Proceedings of the 37th International Conference on Neural Information Processing Systems (NIPS '23). Curran Associates Inc., Red Hook, NY, USA, Article 377, 8634–8652. +In Proceedings of the 37th International Conference on Neural Information +Processing Systems (NIPS '23). Curran Associates Inc., Red Hook, NY, USA, +Article 377, 8634–8652. https://arxiv.org/abs/2303.11366 This module implements a self-reflection loop for LLM responses using groundedness evaluation. 
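+
+In outline (illustrative sketch — see execute_query_with_self_reflection below for the
+actual control flow):
+
+    for _ in range(max_self_reflections):
+        response = await agent.run(messages=messages)
+        score = await evaluate_groundedness(evals, full_user_query, response.text, context)
+        if score == 5:
+            break
+        messages.append(Message("user", [build_feedback(score)]))  # build_feedback is hypothetical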
@@ -86,104 +80,37 @@ DEFAULT_JUDGE_MODEL = "gpt-5.2" -def create_openai_client(): - endpoint = os.environ["FOUNDRY_PROJECT_ENDPOINT"] - credential = AzureCliCredential() - project_client = AIProjectClient(endpoint=endpoint, credential=credential) - return project_client.get_openai_client() - - -def create_async_project_client(): - from azure.ai.projects.aio import AIProjectClient as AsyncAIProjectClient - from azure.identity.aio import AzureCliCredential as AsyncAzureCliCredential - - return AsyncAIProjectClient(endpoint=os.environ["FOUNDRY_PROJECT_ENDPOINT"], credential=AsyncAzureCliCredential()) - - -def create_eval(client: openai.OpenAI, judge_model: str) -> openai.types.EvalCreateResponse: - print("Creating Eval") - data_source_config = DataSourceConfigCustom({ - "type": "custom", - "item_schema": { - "type": "object", - "properties": { - "query": {"type": "string"}, - "response": {"type": "string"}, - "context": {"type": "string"}, - }, - "required": [], - }, - "include_sample_schema": True, - }) - - testing_criteria = [ - { - "type": "azure_ai_evaluator", - "name": "groundedness", - "evaluator_name": "builtin.groundedness", - "data_mapping": {"query": "{{item.query}}", "response": "{{item.response}}", "context": "{{item.context}}"}, - "initialization_parameters": {"deployment_name": f"{judge_model}"}, - } - ] - - return client.evals.create( - name="Eval", - data_source_config=data_source_config, - testing_criteria=testing_criteria, # type: ignore - ) - - -def run_eval( - client: openai.OpenAI, - eval_object: openai.types.EvalCreateResponse, +async def evaluate_groundedness( + evals: FoundryEvals, query: str, response: str, context: str, -): - eval_run_object = client.evals.runs.create( - eval_id=eval_object.id, - name="inline_data_run", - metadata={"team": "eval-exp", "scenario": "inline-data-v1"}, - data_source=CreateEvalJSONLRunDataSourceParam( - type="jsonl", - source=SourceFileContent( - type="file_content", - content=[ - SourceFileContentContent( - item={ - "query": query, - "context": context, - "response": response, - } - ), - ], - ), - ), +) -> float | None: + """Run a single groundedness evaluation and return the score.""" + item = EvalItem( + conversation=[ + Message("user", [query]), + Message("assistant", [response]), + ], + context=context, ) - - eval_run_response = client.evals.runs.retrieve(run_id=eval_run_object.id, eval_id=eval_object.id) - - MAX_RETRY = 10 - for _ in range(0, MAX_RETRY): - run = client.evals.runs.retrieve(run_id=eval_run_response.id, eval_id=eval_object.id) - if run.status == "failed": - print( - f"Eval run failed. Run ID: {run.id}, Status: {run.status}, Error: {getattr(run, 'error', 'Unknown error')}" - ) - continue - if run.status == "completed": - return list(client.evals.runs.output_items.list(run_id=run.id, eval_id=eval_object.id)) - time.sleep(5) - - print("Eval result retrieval timeout.") + results = await evals.evaluate( + [item], + eval_name="Self-Reflection Groundedness", + ) + if results.status != "completed" or not results.items: + return None + # Return the first evaluator score + for score in results.items[0].scores: + if score.score is not None: + return float(score.score) return None async def execute_query_with_self_reflection( *, - client: openai.OpenAI, + evals: FoundryEvals, agent: Agent, - eval_object: openai.types.EvalCreateResponse, full_user_query: str, context: str, max_self_reflections: int = 3, @@ -192,10 +119,10 @@ async def execute_query_with_self_reflection( Execute a query with self-reflection loop. 
Args: + evals: FoundryEvals instance for groundedness scoring agent: Agent instance to use for generating responses full_user_query: Complete prompt including system prompt, user request, and context context: Context document for groundedness evaluation - evaluator: Groundedness evaluator function max_self_reflections: Maximum number of self-reflection iterations Returns: @@ -205,7 +132,6 @@ async def execute_query_with_self_reflection( - best_iteration: Iteration number where best score was achieved - iteration_scores: List of groundedness scores for each iteration - messages: Full conversation history - - usage_metadata: Token usage information - num_retries: Number of iterations performed - total_groundedness_eval_time: Time spent on evaluations (seconds) - total_end_to_end_time: Total execution time (seconds) @@ -219,7 +145,7 @@ async def execute_query_with_self_reflection( raw_response = None total_groundedness_eval_time = 0.0 start_time = time.time() - iteration_scores = [] # Store all iteration scores in structured format + iteration_scores = [] for i in range(max_self_reflections): print(f" Self-reflection iteration {i + 1}/{max_self_reflections}...") @@ -227,22 +153,16 @@ async def execute_query_with_self_reflection( raw_response = await agent.run(messages=messages) agent_response = raw_response.text - # Evaluate groundedness + # Evaluate groundedness using FoundryEvals start_time_eval = time.time() - eval_run_output_items = run_eval( - client=client, - eval_object=eval_object, - query=full_user_query, - response=agent_response, - context=context, - ) - if eval_run_output_items is None: - print(f" ⚠️ Groundedness evaluation failed (timeout or error) for iteration {i + 1}.") - continue - score = eval_run_output_items[0].results[0].score + score = await evaluate_groundedness(evals, full_user_query, agent_response, context) end_time_eval = time.time() total_groundedness_eval_time += end_time_eval - start_time_eval + if score is None: + print(f" ⚠️ Groundedness evaluation failed for iteration {i + 1}.") + continue + # Store score in structured format iteration_scores.append(score) @@ -293,7 +213,6 @@ async def execute_query_with_self_reflection( async def run_self_reflection_batch( - project_client: AIProjectClient, input_file: str, output_file: str, agent_model: str = DEFAULT_AGENT_MODEL, @@ -301,7 +220,7 @@ async def run_self_reflection_batch( max_self_reflections: int = 3, env_file: str | None = None, limit: int | None = None, -): +) -> None: """ Run self-reflection on a batch of prompts. 
@@ -315,17 +234,31 @@ async def run_self_reflection_batch( limit: Optional limit to process only the first N prompts """ # Load environment variables - if env_file and os.path.exists(env_file): - load_dotenv(env_file, override=True) - else: - load_dotenv(override=True) + load_dotenv(env_file, override=True) if env_file else load_dotenv(override=True) + + from azure.ai.projects.aio import AIProjectClient as AsyncAIProjectClient + + endpoint = os.environ["FOUNDRY_PROJECT_ENDPOINT"] + credential = AsyncAzureCliCredential() + project_client = AsyncAIProjectClient(endpoint=endpoint, credential=credential) - # Create agent, it loads environment variables AZURE_OPENAI_API_KEY and AZURE_OPENAI_ENDPOINT automatically - responses_client = FoundryChatClient( + # Create agent client + agent_client = FoundryChatClient( project_client=project_client, model=agent_model, ) + # Create FoundryEvals for groundedness scoring + judge_client = FoundryChatClient( + project_client=project_client, + model=judge_model, + ) + evals = FoundryEvals( + client=judge_client, + model=judge_model, + evaluators=[FoundryEvals.GROUNDEDNESS], + ) + # Load input data input_path = (Path(__file__).parent / input_file).resolve() print(f"Loading prompts from: {input_path}") @@ -351,13 +284,6 @@ async def run_self_reflection_batch( if missing_columns: raise ValueError(f"Input file missing required columns: {missing_columns}") - # Configure clients - print("Configuring Azure OpenAI client...") - client = create_openai_client() - - # Create Eval - eval_object = create_eval(client=client, judge_model=judge_model) - # Process each prompt print(f"Max self-reflections: {max_self_reflections}\n") @@ -367,9 +293,8 @@ async def run_self_reflection_batch( try: result = await execute_query_with_self_reflection( - client=client, - agent=Agent(client=responses_client, instructions=row["system_instruction"]), - eval_object=eval_object, + evals=evals, + agent=Agent(client=agent_client, instructions=row["system_instruction"]), full_user_query=row["full_prompt"], context=row["context_document"], max_self_reflections=max_self_reflections, @@ -452,9 +377,8 @@ async def run_self_reflection_batch( perfect_scores = sum(1 for s in best_scores if s == 5) print("\nGroundedness Scores:") print(f" Average best score: {avg_score:.2f}/5") - print( - f" Perfect scores (5/5): {perfect_scores}/{len(best_scores)} ({100 * perfect_scores / len(best_scores):.1f}%)" - ) + pct = 100 * perfect_scores / len(best_scores) + print(f" Perfect scores (5/5): {perfect_scores}/{len(best_scores)} ({pct:.1f}%)") # Calculate improvement metrics if iteration_scores_list: @@ -472,9 +396,8 @@ async def run_self_reflection_batch( print(f" Average first score: {avg_first_score:.2f}/5") print(f" Average final score: {avg_last_score:.2f}/5") print(f" Average improvement: +{avg_improvement:.2f}") - print( - f" Responses that improved: {improved_count}/{len(improvements)} ({100 * improved_count / len(improvements):.1f}%)" - ) + pct = 100 * improved_count / len(improvements) + print(f" Responses that improved: {improved_count}/{len(improvements)} ({pct:.1f}%)") # Show iteration statistics if iterations: @@ -486,6 +409,8 @@ async def run_self_reflection_batch( print("=" * 60) + await credential.close() + async def main(): """CLI entry point.""" @@ -519,7 +444,6 @@ async def main(): # Run the batch processing try: await run_self_reflection_batch( - project_client=create_async_project_client(), input_file=args.input, output_file=args.output, agent_model=args.agent_model,