From 257eff4bfa9484730d6043787e916e52ad95a583 Mon Sep 17 00:00:00 2001 From: alliscode Date: Tue, 17 Mar 2026 14:10:54 -0700 Subject: [PATCH 1/4] Foundry Evals integration for .NET Add evaluation framework with local and Foundry-hosted evaluator support: - EvalItem/EvalCheck/EvalChecks core types with IConversationSplitter - IAgentEvaluator interface and MeaiEvaluatorAdapter for MEAI bridge - FunctionEvaluator and LocalEvaluator for custom evaluation functions - FoundryEvals provider for Azure AI Foundry hosted evaluations - EvaluateAsync extension methods with expected values support - WorkflowEvaluationExtensions for multi-agent workflow evaluation - Unit tests and evaluation samples Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- dotnet/agent-framework-dotnet.slnx | 594 +++++------ ...s_Evaluations_Step02_SelfReflection.csproj | 1 - .../Program.cs | 17 +- ...ents_Evaluations_Step03_AllPatterns.csproj | 24 + .../Program.cs | 337 +++++++ .../README.md | 49 + .../Evaluation/FoundryEvals.cs | 224 +++++ .../Microsoft.Agents.AI.AzureAI.csproj | 3 + .../WorkflowEvaluationExtensions.cs | 135 +++ .../Evaluation/AgentEvaluationExtensions.cs | 346 +++++++ .../Evaluation/AgentEvaluationResults.cs | 126 +++ .../Evaluation/CheckResult.cs | 11 + .../Evaluation/EvalCheck.cs | 10 + .../Evaluation/EvalChecks.cs | 86 ++ .../Evaluation/EvalItem.cs | 141 +++ .../Evaluation/ExpectedToolCall.cs | 20 + .../Evaluation/FunctionEvaluator.cs | 68 ++ .../Evaluation/IAgentEvaluator.cs | 33 + .../Evaluation/IConversationSplitter.cs | 103 ++ .../Evaluation/LocalEvaluator.cs | 66 ++ .../Evaluation/MeaiEvaluatorAdapter.cs | 62 ++ .../Microsoft.Agents.AI.csproj | 1 + .../EvaluationTests.cs | 935 ++++++++++++++++++ 23 files changed, 3056 insertions(+), 336 deletions(-) create mode 100644 dotnet/samples/GettingStarted/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/FoundryAgents_Evaluations_Step03_AllPatterns.csproj create mode 100644 
dotnet/samples/GettingStarted/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/Program.cs create mode 100644 dotnet/samples/GettingStarted/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/README.md create mode 100644 dotnet/src/Microsoft.Agents.AI.AzureAI/Evaluation/FoundryEvals.cs create mode 100644 dotnet/src/Microsoft.Agents.AI.Workflows/Evaluation/WorkflowEvaluationExtensions.cs create mode 100644 dotnet/src/Microsoft.Agents.AI/Evaluation/AgentEvaluationExtensions.cs create mode 100644 dotnet/src/Microsoft.Agents.AI/Evaluation/AgentEvaluationResults.cs create mode 100644 dotnet/src/Microsoft.Agents.AI/Evaluation/CheckResult.cs create mode 100644 dotnet/src/Microsoft.Agents.AI/Evaluation/EvalCheck.cs create mode 100644 dotnet/src/Microsoft.Agents.AI/Evaluation/EvalChecks.cs create mode 100644 dotnet/src/Microsoft.Agents.AI/Evaluation/EvalItem.cs create mode 100644 dotnet/src/Microsoft.Agents.AI/Evaluation/ExpectedToolCall.cs create mode 100644 dotnet/src/Microsoft.Agents.AI/Evaluation/FunctionEvaluator.cs create mode 100644 dotnet/src/Microsoft.Agents.AI/Evaluation/IAgentEvaluator.cs create mode 100644 dotnet/src/Microsoft.Agents.AI/Evaluation/IConversationSplitter.cs create mode 100644 dotnet/src/Microsoft.Agents.AI/Evaluation/LocalEvaluator.cs create mode 100644 dotnet/src/Microsoft.Agents.AI/Evaluation/MeaiEvaluatorAdapter.cs create mode 100644 dotnet/tests/Microsoft.Agents.AI.UnitTests/EvaluationTests.cs diff --git a/dotnet/agent-framework-dotnet.slnx b/dotnet/agent-framework-dotnet.slnx index a4ffe13958..6e75bd355b 100644 --- a/dotnet/agent-framework-dotnet.slnx +++ b/dotnet/agent-framework-dotnet.slnx @@ -5,213 +5,230 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -219,118 +236,60 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - @@ -339,6 +298,7 @@ + @@ -361,10 +321,6 @@ - - - - @@ -375,10 +331,6 @@ - - - - @@ -444,10 +396,6 @@ - - - - @@ -455,10 +403,6 @@ - - - - @@ -482,7 +426,6 @@ - @@ -490,11 +433,11 @@ + - @@ -508,11 +451,11 @@ - + @@ -529,20 +472,19 @@ - + - - + \ No newline at end of file diff --git a/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step02_SelfReflection/FoundryAgents_Evaluations_Step02_SelfReflection.csproj b/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step02_SelfReflection/FoundryAgents_Evaluations_Step02_SelfReflection.csproj index 646cd75532..8b6a7d5001 100644 --- a/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step02_SelfReflection/FoundryAgents_Evaluations_Step02_SelfReflection.csproj +++ b/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step02_SelfReflection/FoundryAgents_Evaluations_Step02_SelfReflection.csproj @@ -9,7 +9,6 @@ - diff --git a/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step02_SelfReflection/Program.cs b/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step02_SelfReflection/Program.cs index 8f8c9fa4ee..ca1e8a06ad 
100644 --- a/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step02_SelfReflection/Program.cs +++ b/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step02_SelfReflection/Program.cs @@ -12,8 +12,8 @@ // For more details, see: // https://learn.microsoft.com/dotnet/ai/evaluation/libraries -using Azure.AI.OpenAI; using Azure.AI.Projects; +using Azure.AI.Projects.OpenAI; using Azure.Identity; using Microsoft.Agents.AI; using Microsoft.Extensions.AI; @@ -24,26 +24,25 @@ using ChatMessage = Microsoft.Extensions.AI.ChatMessage; using ChatRole = Microsoft.Extensions.AI.ChatRole; -string endpoint = Environment.GetEnvironmentVariable("AZURE_AI_PROJECT_ENDPOINT") ?? throw new InvalidOperationException("AZURE_AI_PROJECT_ENDPOINT is not set."); -string deploymentName = Environment.GetEnvironmentVariable("AZURE_AI_MODEL_DEPLOYMENT_NAME") ?? "gpt-4o-mini"; -string openAiEndpoint = Environment.GetEnvironmentVariable("AZURE_OPENAI_ENDPOINT") ?? throw new InvalidOperationException("AZURE_OPENAI_ENDPOINT is not set."); -string evaluatorDeploymentName = Environment.GetEnvironmentVariable("AZURE_OPENAI_DEPLOYMENT_NAME") ?? deploymentName; +string endpoint = Environment.GetEnvironmentVariable("AZURE_FOUNDRY_PROJECT_ENDPOINT") ?? throw new InvalidOperationException("AZURE_FOUNDRY_PROJECT_ENDPOINT is not set."); +string deploymentName = Environment.GetEnvironmentVariable("AZURE_FOUNDRY_PROJECT_DEPLOYMENT_NAME") ?? "gpt-4o-mini"; Console.WriteLine("=" + new string('=', 79)); Console.WriteLine("SELF-REFLECTION EVALUATION SAMPLE"); Console.WriteLine("=" + new string('=', 79)); Console.WriteLine(); -// Initialize Azure credentials and client +// Initialize Azure credentials and client — everything derives from the project endpoint // WARNING: DefaultAzureCredential is convenient for development but requires careful consideration in production. 
// In production, consider using a specific credential (e.g., ManagedIdentityCredential) to avoid // latency issues, unintended credential probing, and potential security risks from fallback mechanisms. DefaultAzureCredential credential = new(); AIProjectClient aiProjectClient = new(new Uri(endpoint), credential); -// Set up the LLM-based chat client for quality evaluators -IChatClient chatClient = new AzureOpenAIClient(new Uri(openAiEndpoint), credential) - .GetChatClient(evaluatorDeploymentName) +// Get a chat client for LLM-based evaluators from the project client +IChatClient chatClient = aiProjectClient + .GetProjectOpenAIClient() + .GetChatClient(deploymentName) .AsIChatClient(); // Configure evaluation: quality evaluators use the LLM, safety evaluators use Azure AI Foundry diff --git a/dotnet/samples/GettingStarted/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/FoundryAgents_Evaluations_Step03_AllPatterns.csproj b/dotnet/samples/GettingStarted/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/FoundryAgents_Evaluations_Step03_AllPatterns.csproj new file mode 100644 index 0000000000..8b6a7d5001 --- /dev/null +++ b/dotnet/samples/GettingStarted/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/FoundryAgents_Evaluations_Step03_AllPatterns.csproj @@ -0,0 +1,24 @@ + + + + Exe + net10.0 + + enable + enable + + + + + + + + + + + + + + + + diff --git a/dotnet/samples/GettingStarted/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/Program.cs b/dotnet/samples/GettingStarted/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/Program.cs new file mode 100644 index 0000000000..ec6531cf85 --- /dev/null +++ b/dotnet/samples/GettingStarted/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/Program.cs @@ -0,0 +1,337 @@ +// Copyright (c) Microsoft. All rights reserved. + +// This sample demonstrates all evaluation patterns available in Agent Framework for .NET. +// It covers: +// 1. 
Function evaluators — custom checks using lambdas +// 2. Built-in checks — keyword and tool-called validation +// 3. MEAI evaluators — LLM-based quality scoring (Relevance, Coherence, Groundedness) +// 4. Foundry evaluators — cloud-based evaluation with Azure AI Foundry +// 5. Mixed evaluators — combining local checks with cloud evaluation +// 6. Pre-existing response evaluation — evaluate responses without re-running the agent +// 7. Conversation split strategies — LastTurn, Full, PerTurn, and call-site override +// +// Mirrors the Python sample: evaluate_all_patterns_sample.py + +using Azure.AI.Projects; +using Azure.AI.Projects.OpenAI; +using Azure.Identity; +using Microsoft.Agents.AI; +using Microsoft.Agents.AI.AzureAI; +using Microsoft.Extensions.AI; +using Microsoft.Extensions.AI.Evaluation; +using Microsoft.Extensions.AI.Evaluation.Quality; +using Microsoft.Extensions.AI.Evaluation.Safety; + +using ChatMessage = Microsoft.Extensions.AI.ChatMessage; +using ChatRole = Microsoft.Extensions.AI.ChatRole; +using FoundryEvals = Microsoft.Agents.AI.AzureAI.FoundryEvals; + +string endpoint = Environment.GetEnvironmentVariable("AZURE_FOUNDRY_PROJECT_ENDPOINT") + ?? throw new InvalidOperationException("AZURE_FOUNDRY_PROJECT_ENDPOINT is not set."); +string deploymentName = Environment.GetEnvironmentVariable("AZURE_FOUNDRY_PROJECT_DEPLOYMENT_NAME") ?? 
"gpt-4o-mini"; + +Console.WriteLine("=" + new string('=', 79)); +Console.WriteLine("AGENT FRAMEWORK EVALUATION — ALL PATTERNS"); +Console.WriteLine("=" + new string('=', 79)); +Console.WriteLine(); + +// Initialize Azure credentials and clients — everything derives from the project endpoint +DefaultAzureCredential credential = new(); +AIProjectClient aiProjectClient = new(new Uri(endpoint), credential); + +// Get a chat client for LLM-based evaluators from the project client +IChatClient chatClient = aiProjectClient + .GetProjectOpenAIClient() + .GetChatClient(deploymentName) + .AsIChatClient(); + +ContentSafetyServiceConfiguration safetyConfig = new( + credential: credential, + endpoint: new Uri(endpoint)); + +ChatConfiguration chatConfiguration = safetyConfig.ToChatConfiguration( + originalChatConfiguration: new ChatConfiguration(chatClient)); + +// Create test agent +AIAgent agent = await aiProjectClient.CreateAIAgentAsync( + name: "WeatherAgent", + model: deploymentName, + instructions: "You are a helpful weather assistant. 
Answer questions about weather accurately and concisely."); + +Console.WriteLine($"Created agent: {agent.Name}"); +Console.WriteLine(); + +string[] queries = ["What's the weather in Seattle?", "Is it going to rain in New York today?"]; + +try +{ + // ================================================================ + // Section 1: Function Evaluators + // ================================================================ + Console.WriteLine("SECTION 1: Function Evaluators"); + Console.WriteLine(new string('-', 60)); + + var functionEvaluator = new LocalEvaluator( + FunctionEvaluator.Create("is_concise", + (string response) => response.Split(' ').Length < 500), + FunctionEvaluator.Create("has_content", + (string response) => response.Length > 10), + FunctionEvaluator.Create("mentions_location", + (EvalItem item) => item.Response.Contains("Seattle", StringComparison.OrdinalIgnoreCase) + || item.Response.Contains("New York", StringComparison.OrdinalIgnoreCase))); + + AgentEvaluationResults functionResults = await agent.EvaluateAsync( + queries, + functionEvaluator); + + PrintResults("Function Evaluators", functionResults); + + // ================================================================ + // Section 2: Built-in Checks + // ================================================================ + Console.WriteLine("SECTION 2: Built-in Checks"); + Console.WriteLine(new string('-', 60)); + + var builtinEvaluator = new LocalEvaluator( + EvalChecks.KeywordCheck("weather"), + EvalChecks.KeywordCheck(caseSensitive: false, "temperature", "forecast")); + + AgentEvaluationResults builtinResults = await agent.EvaluateAsync( + queries, + builtinEvaluator); + + PrintResults("Built-in Checks", builtinResults); + + // ================================================================ + // Section 3: MEAI Quality Evaluators + // ================================================================ + Console.WriteLine("SECTION 3: MEAI Quality Evaluators"); + Console.WriteLine(new string('-', 
60)); + + // Pass MEAI evaluators directly — no adapter needed + AgentEvaluationResults meaiResults = await agent.EvaluateAsync( + queries, + new CompositeEvaluator( + new RelevanceEvaluator(), + new CoherenceEvaluator()), + chatConfiguration); + + PrintResults("MEAI Quality", meaiResults); + + // Print per-metric details for MEAI results + foreach (EvaluationResult itemResult in meaiResults.Items) + { + foreach (EvaluationMetric metric in itemResult.Metrics.Values) + { + if (metric is NumericMetric n) + { + string rating = n.Interpretation?.Rating.ToString() ?? "N/A"; + Console.WriteLine($" {n.Name,-20} Score: {n.Value:F1}/5 Rating: {rating}"); + } + } + } + + Console.WriteLine(); + + // ================================================================ + // Section 4: Foundry Evaluators (Cloud-based) + // ================================================================ + Console.WriteLine("SECTION 4: Foundry Evaluators"); + Console.WriteLine(new string('-', 60)); + + var foundryEvaluator = new FoundryEvals( + chatConfiguration, + FoundryEvals.Relevance, + FoundryEvals.Coherence, + FoundryEvals.Groundedness); + + AgentEvaluationResults foundryResults = await agent.EvaluateAsync( + queries, + foundryEvaluator); + + PrintResults("Foundry Evaluators", foundryResults); + + // ================================================================ + // Section 5: Mixed Evaluators (Local + Cloud) + // ================================================================ + Console.WriteLine("SECTION 5: Mixed Evaluators"); + Console.WriteLine(new string('-', 60)); + + IReadOnlyList mixedResults = await agent.EvaluateAsync( + queries, + evaluators: new IAgentEvaluator[] + { + new LocalEvaluator( + EvalChecks.KeywordCheck("weather"), + FunctionEvaluator.Create("not_empty", (string r) => r.Length > 0)), + new FoundryEvals(chatConfiguration, FoundryEvals.Relevance), + }); + + foreach (AgentEvaluationResults result in mixedResults) + { + PrintResults($"Mixed - {result.Provider}", result); + 
} + + // ================================================================ + // Section 6: Evaluate Pre-existing Responses + // ================================================================ + Console.WriteLine("SECTION 6: Evaluate Pre-existing Responses"); + Console.WriteLine(new string('-', 60)); + + // Get responses first + var savedQueries = new List(); + var savedResponses = new List(); + foreach (string query in queries) + { + AgentResponse response = await agent.RunAsync( + new List { new(ChatRole.User, query) }); + savedQueries.Add(query); + savedResponses.Add(response); + } + + // Evaluate the saved responses without re-running the agent + AgentEvaluationResults preExistingResults = await agent.EvaluateAsync( + savedResponses, + savedQueries, + new LocalEvaluator( + EvalChecks.KeywordCheck("weather"), + FunctionEvaluator.Create("response_quality", + (EvalItem item) => new EvalCheckResult( + item.Response.Length > 20, + item.Response.Length > 20 + ? "Response is detailed enough" + : "Response is too short", + "response_quality")))); + + PrintResults("Pre-existing Responses", preExistingResults); + + // ================================================================ + // Section 7: Conversation Split Strategies + // ================================================================ + Console.WriteLine("SECTION 7: Conversation Split Strategies"); + Console.WriteLine(new string('-', 60)); + + // Build a multi-turn conversation manually + var multiTurnConversation = new List + { + new(ChatRole.User, "What's the weather in Seattle?"), + new(ChatRole.Assistant, "Seattle is 62°F, cloudy with a chance of rain."), + new(ChatRole.User, "And Paris?"), + new(ChatRole.Assistant, "Paris is 68°F, partly sunny."), + new(ChatRole.User, "Compare them."), + new(ChatRole.Assistant, "Seattle is cooler at 62°F with rain likely, while Paris is warmer at 68°F and sunnier."), + }; + + // Strategy 1: LAST_TURN (default) — evaluates the final response + var lastTurnItem = new 
EvalItem( + "Compare them.", + "Seattle is cooler at 62°F with rain likely, while Paris is warmer at 68°F and sunnier.", + multiTurnConversation); + + var (lastQuery, lastResponse) = lastTurnItem.Split(ConversationSplitters.LastTurn); + Console.WriteLine($" LastTurn split: {lastQuery.Count} query msgs, {lastResponse.Count} response msgs"); + + // Strategy 2: FULL — evaluates the whole conversation trajectory + var fullItem = new EvalItem( + "What's the weather in Seattle?", + "Full conversation trajectory", + multiTurnConversation) + { + Splitter = ConversationSplitters.Full, + }; + + var (fullQuery, fullResponse) = fullItem.Split(); + Console.WriteLine($" Full split: {fullQuery.Count} query msgs, {fullResponse.Count} response msgs"); + + // Strategy 3: PER_TURN — one eval item per user turn + var perTurnItems = EvalItem.PerTurnItems(multiTurnConversation); + Console.WriteLine($" PerTurn split: {perTurnItems.Count} items from {multiTurnConversation.Count} messages"); + + foreach (var turnItem in perTurnItems) + { + Console.WriteLine($" Turn: \"{turnItem.Query}\" → {turnItem.Response.Length} chars"); + } + + // Evaluate per-turn items with a local evaluator + var splitEvaluator = new LocalEvaluator( + FunctionEvaluator.Create("has_response", (string r) => r.Length > 5)); + + AgentEvaluationResults perTurnResults = await splitEvaluator.EvaluateAsync( + perTurnItems.ToList()); + + PrintResults("Per-Turn Evaluation", perTurnResults); + + // Strategy 4: Call-site override with built-in splitter + AgentEvaluationResults fullSplitResults = await agent.EvaluateAsync( + queries, + new LocalEvaluator(EvalChecks.KeywordCheck("weather")), + splitter: ConversationSplitters.Full); + + PrintResults("Call-site Full Split", fullSplitResults); + + // Strategy 5: Custom splitter as call-site override + // Same parameter works for built-in and custom splitters + AgentEvaluationResults customSplitResults = await agent.EvaluateAsync( + queries, + new 
LocalEvaluator(EvalChecks.KeywordCheck("weather")), + splitter: new WeatherToolSplitter()); + + PrintResults("Custom Splitter Override", customSplitResults); + Console.WriteLine(); +} +finally +{ + // Cleanup + await aiProjectClient.Agents.DeleteAgentAsync(agent.Name); + Console.WriteLine("Cleanup: Agent deleted."); +} + +// ============================================================================ +// Helper Functions +// ============================================================================ + +static void PrintResults(string title, AgentEvaluationResults results) +{ + string status = results.AllPassed ? "✓ ALL PASSED" : "✗ SOME FAILED"; + Console.WriteLine($" [{title}] {status} ({results.Passed}/{results.Total})"); + + if (results.SubResults is not null) + { + foreach (var (agentId, sub) in results.SubResults) + { + string subStatus = sub.AllPassed ? "✓" : "✗"; + Console.WriteLine($" {subStatus} {agentId}: {sub.Passed}/{sub.Total}"); + } + } + + Console.WriteLine(); +} + +// ============================================================================ +// Custom Splitter — demonstrates IConversationSplitter +// ============================================================================ + +/// +/// Example custom splitter that splits before the first tool call. +/// Evaluates whether the agent's tool usage and final response are appropriate. 
+/// +sealed class WeatherToolSplitter : IConversationSplitter +{ + public (IReadOnlyList QueryMessages, IReadOnlyList ResponseMessages) Split( + IReadOnlyList conversation) + { + for (int i = 0; i < conversation.Count; i++) + { + if (conversation[i].Role == ChatRole.Assistant + && conversation[i].Contents.OfType().Any()) + { + return ( + conversation.Take(i).ToList(), + conversation.Skip(i).ToList()); + } + } + + // Fallback: use the default LastTurn split + return ConversationSplitters.LastTurn.Split(conversation); + } +} \ No newline at end of file diff --git a/dotnet/samples/GettingStarted/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/README.md b/dotnet/samples/GettingStarted/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/README.md new file mode 100644 index 0000000000..d7b598a771 --- /dev/null +++ b/dotnet/samples/GettingStarted/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/README.md @@ -0,0 +1,49 @@ +# Evaluation — All Patterns + +This sample demonstrates all evaluation patterns available in Agent Framework for .NET: + +| Section | Pattern | Description | +|---------|---------|-------------| +| 1 | **Function Evaluators** | Custom checks using C# lambdas via `FunctionEvaluator.Create()` | +| 2 | **Built-in Checks** | `EvalChecks.KeywordCheck()` and `EvalChecks.ToolCalledCheck()` | +| 3 | **MEAI Quality Evaluators** | LLM-based scoring with `RelevanceEvaluator`, `CoherenceEvaluator` | +| 4 | **Foundry Evaluators** | Cloud-based evaluation via `FoundryEvals` | +| 5 | **Mixed Evaluators** | Combining local checks with cloud evaluation in one call | +| 6 | **Pre-existing Responses** | Evaluate saved responses without re-running the agent | + +## Prerequisites + +- Azure AI Foundry project with a deployed model +- Set environment variables: + - `AZURE_FOUNDRY_PROJECT_ENDPOINT` — Your Azure AI Foundry project endpoint + - `AZURE_FOUNDRY_PROJECT_DEPLOYMENT_NAME` — Model deployment name (default: `gpt-4o-mini`) + +## Key Types + 
+```csharp +// Custom function evaluators +var check = FunctionEvaluator.Create("name", (string response) => response.Length > 10); + +// Built-in checks +var keyword = EvalChecks.KeywordCheck("expected", "keywords"); +var toolCheck = EvalChecks.ToolCalledCheck("tool_name"); + +// Local evaluator runs checks without API calls +var local = new LocalEvaluator(check, keyword, toolCheck); + +// MEAI evaluators work directly — no adapter needed +var results = await agent.EvaluateAsync(queries, new RelevanceEvaluator(), chatConfig); + +// Foundry evaluator uses Azure AI Foundry cloud evaluation +var foundry = new FoundryEvals(chatConfig, FoundryEvals.Relevance, FoundryEvals.Coherence); + +// Evaluate an agent +AgentEvaluationResults localResults = await agent.EvaluateAsync(queries, local); +localResults.AssertAllPassed(); +``` + +## Running + +```bash +dotnet run --project FoundryAgents_Evaluations_Step03_AllPatterns.csproj +``` diff --git a/dotnet/src/Microsoft.Agents.AI.AzureAI/Evaluation/FoundryEvals.cs b/dotnet/src/Microsoft.Agents.AI.AzureAI/Evaluation/FoundryEvals.cs new file mode 100644 index 0000000000..6e650683d4 --- /dev/null +++ b/dotnet/src/Microsoft.Agents.AI.AzureAI/Evaluation/FoundryEvals.cs @@ -0,0 +1,224 @@ +// Copyright (c) Microsoft. All rights reserved. + +using System.Linq; +using Microsoft.Extensions.AI; +using Microsoft.Extensions.AI.Evaluation; +using Microsoft.Extensions.AI.Evaluation.Quality; +using Microsoft.Extensions.AI.Evaluation.Safety; + +namespace Microsoft.Agents.AI.AzureAI; + +/// +/// Azure AI Foundry evaluator provider with built-in evaluator name constants. +/// +/// +/// +/// Combines evaluator constants (e.g., , ) +/// with the implementation that maps them to MEAI evaluators. +/// +/// +/// When the Azure.AI.Projects .NET SDK adds native evaluation API support, this class +/// will be updated to use it for full parity with the Python FoundryEvals class. 
+/// +/// +public sealed class FoundryEvals : IAgentEvaluator +{ + private readonly ChatConfiguration _chatConfiguration; + private readonly string[] _evaluatorNames; + private readonly IConversationSplitter? _splitter; + + // ----------------------------------------------------------------------- + // Constructors + // ----------------------------------------------------------------------- + + /// + /// Initializes a new instance of the class. + /// + /// Chat configuration for the LLM-based evaluators. + /// + /// Names of evaluators to use (e.g., , ). + /// When empty, defaults to relevance and coherence. + /// + public FoundryEvals(ChatConfiguration chatConfiguration, params string[] evaluators) + : this(chatConfiguration, splitter: null, evaluators) + { + } + + /// + /// Initializes a new instance of the class with a default splitter. + /// + /// Chat configuration for the LLM-based evaluators. + /// + /// Default conversation splitter for multi-turn conversations. Overridden by + /// when set on individual items. + /// Use , , + /// or a custom implementation. + /// + /// + /// Names of evaluators to use (e.g., , ). + /// When empty, defaults to relevance and coherence. + /// + public FoundryEvals(ChatConfiguration chatConfiguration, IConversationSplitter? splitter, params string[] evaluators) + { + this._chatConfiguration = chatConfiguration; + this._splitter = splitter; + this._evaluatorNames = evaluators.Length > 0 + ? 
evaluators + : [Relevance, Coherence]; + } + + // ----------------------------------------------------------------------- + // IAgentEvaluator + // ----------------------------------------------------------------------- + + /// + public string Name => "FoundryEvals"; + + /// + public async Task EvaluateAsync( + IReadOnlyList items, + string evalName = "Foundry Eval", + CancellationToken cancellationToken = default) + { + var meaiEvaluators = BuildEvaluators(this._evaluatorNames); + var composite = new CompositeEvaluator(meaiEvaluators.ToArray()); + + var results = new List(items.Count); + + foreach (var item in items) + { + cancellationToken.ThrowIfCancellationRequested(); + + // Resolve splitter: item-level > evaluator-level > LastTurn default + var effectiveSplitter = item.Splitter ?? this._splitter; + var (queryMessages, _) = item.Split(effectiveSplitter); + var messages = queryMessages.ToList(); + + var chatResponse = item.RawResponse + ?? new ChatResponse(new ChatMessage(ChatRole.Assistant, item.Response)); + + var additionalContext = new List(); + + if (item.Context is not null) + { + additionalContext.Add(new GroundednessEvaluatorContext(item.Context)); + } + + var result = await composite.EvaluateAsync( + messages, + chatResponse, + this._chatConfiguration, + additionalContext: additionalContext.Count > 0 ? additionalContext : null, + cancellationToken: cancellationToken).ConfigureAwait(false); + + results.Add(result); + } + + return new AgentEvaluationResults(this.Name, results); + } + + // ----------------------------------------------------------------------- + // Evaluator name constants + // ----------------------------------------------------------------------- + + // Agent behavior + + /// Evaluates whether the agent correctly resolves user intent. + public const string IntentResolution = "intent_resolution"; + + /// Evaluates whether the agent adheres to its task instructions. 
+ public const string TaskAdherence = "task_adherence"; + + /// Evaluates whether the agent completes the requested task. + public const string TaskCompletion = "task_completion"; + + /// Evaluates the efficiency of the agent's navigation to complete the task. + public const string TaskNavigationEfficiency = "task_navigation_efficiency"; + + // Tool usage + + /// Evaluates the accuracy of tool calls made by the agent. + public const string ToolCallAccuracy = "tool_call_accuracy"; + + /// Evaluates whether the agent selects the correct tools. + public const string ToolSelection = "tool_selection"; + + /// Evaluates the accuracy of inputs provided to tools. + public const string ToolInputAccuracy = "tool_input_accuracy"; + + /// Evaluates how well the agent uses tool outputs. + public const string ToolOutputUtilization = "tool_output_utilization"; + + /// Evaluates whether tool calls succeed. + public const string ToolCallSuccess = "tool_call_success"; + + // Quality + + /// Evaluates the coherence of the response. + public const string Coherence = "coherence"; + + /// Evaluates the fluency of the response. + public const string Fluency = "fluency"; + + /// Evaluates the relevance of the response to the query. + public const string Relevance = "relevance"; + + /// Evaluates whether the response is grounded in the provided context. + public const string Groundedness = "groundedness"; + + /// Evaluates the completeness of the response. + public const string ResponseCompleteness = "response_completeness"; + + /// Evaluates the similarity between the response and the expected output. + public const string Similarity = "similarity"; + + // Safety + + /// Evaluates the response for violent content. + public const string Violence = "violence"; + + /// Evaluates the response for sexual content. + public const string Sexual = "sexual"; + + /// Evaluates the response for self-harm content. 
+ public const string SelfHarm = "self_harm"; + + /// Evaluates the response for hate or unfairness. + public const string HateUnfairness = "hate_unfairness"; + + // ----------------------------------------------------------------------- + // Internal helpers + // ----------------------------------------------------------------------- + + private static List BuildEvaluators(string[] names) + { + var evaluators = new List(); + + foreach (var name in names) + { + var evaluator = name switch + { + Relevance => new RelevanceEvaluator(), + Coherence => new CoherenceEvaluator(), + Groundedness => new GroundednessEvaluator(), + Fluency => (IEvaluator)new FluencyEvaluator(), + + // Safety evaluators + Violence or + Sexual or + SelfHarm or + HateUnfairness => new ContentHarmEvaluator(), + + // Any other name (e.g., agent evaluators not yet available in MEAI) is skipped silently — nothing is logged + _ => null, + }; + + if (evaluator is not null) + { + evaluators.Add(evaluator); + } + } + + return evaluators; + } +} diff --git a/dotnet/src/Microsoft.Agents.AI.AzureAI/Microsoft.Agents.AI.AzureAI.csproj b/dotnet/src/Microsoft.Agents.AI.AzureAI/Microsoft.Agents.AI.AzureAI.csproj index 0cd8690126..2488a20519 100644 --- a/dotnet/src/Microsoft.Agents.AI.AzureAI/Microsoft.Agents.AI.AzureAI.csproj +++ b/dotnet/src/Microsoft.Agents.AI.AzureAI/Microsoft.Agents.AI.AzureAI.csproj @@ -16,6 +16,9 @@ + + + diff --git a/dotnet/src/Microsoft.Agents.AI.Workflows/Evaluation/WorkflowEvaluationExtensions.cs b/dotnet/src/Microsoft.Agents.AI.Workflows/Evaluation/WorkflowEvaluationExtensions.cs new file mode 100644 index 0000000000..d404b182bc --- /dev/null +++ b/dotnet/src/Microsoft.Agents.AI.Workflows/Evaluation/WorkflowEvaluationExtensions.cs @@ -0,0 +1,135 @@ +// Copyright (c) Microsoft. All rights reserved. 
+ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Extensions.AI; +using Microsoft.Extensions.AI.Evaluation; + +namespace Microsoft.Agents.AI.Workflows; + +/// +/// Extension methods for evaluating workflow runs. +/// +public static class WorkflowEvaluationExtensions +{ + /// + /// Evaluates a completed workflow run. + /// + /// The completed workflow run. + /// The evaluator to score results. + /// Whether to include an overall evaluation. + /// Whether to include per-agent breakdowns. + /// Display name for this evaluation run. + /// + /// Optional conversation splitter to apply to all items. + /// Use , , + /// or a custom implementation. + /// + /// Cancellation token. + /// Evaluation results with optional per-agent sub-results. + public static async Task EvaluateAsync( + this Run run, + IAgentEvaluator evaluator, + bool includeOverall = true, + bool includePerAgent = true, + string evalName = "Workflow Eval", + IConversationSplitter? splitter = null, + CancellationToken cancellationToken = default) + { + var events = run.OutgoingEvents.ToList(); + + // Extract per-agent data + var agentData = ExtractAgentData(events, splitter); + + // Build overall items from final output + var overallItems = new List(); + if (includeOverall) + { + var finalResponse = events.OfType().LastOrDefault(); + if (finalResponse is not null) + { + var firstInvoked = events.OfType().FirstOrDefault(); + var query = firstInvoked?.Data?.ToString() ?? string.Empty; + var conversation = new List + { + new(ChatRole.User, query), + new(ChatRole.Assistant, finalResponse.Response.Text), + }; + + overallItems.Add(new EvalItem(query, finalResponse.Response.Text, conversation) + { + Splitter = splitter, + }); + } + } + + // Evaluate overall + var overallResult = overallItems.Count > 0 + ? 
await evaluator.EvaluateAsync(overallItems, evalName, cancellationToken).ConfigureAwait(false) + : new AgentEvaluationResults(evaluator.Name, Array.Empty()); + + // Per-agent breakdown + if (includePerAgent && agentData.Count > 0) + { + var subResults = new Dictionary(); + + foreach (var kvp in agentData) + { + subResults[kvp.Key] = await evaluator.EvaluateAsync( + kvp.Value, + $"{evalName} - {kvp.Key}", + cancellationToken).ConfigureAwait(false); + } + + overallResult.SubResults = subResults; + } + + return overallResult; + } + + private static Dictionary> ExtractAgentData( + List events, + IConversationSplitter? splitter) + { + var invoked = new Dictionary(); + var agentData = new Dictionary>(); + + foreach (var evt in events) + { + if (evt is ExecutorInvokedEvent invokedEvent) + { + invoked[invokedEvent.ExecutorId] = invokedEvent; + } + else if (evt is ExecutorCompletedEvent completedEvent + && invoked.TryGetValue(completedEvent.ExecutorId, out var matchingInvoked)) + { + var query = matchingInvoked.Data?.ToString() ?? string.Empty; + var responseText = completedEvent.Data?.ToString() ?? string.Empty; + var conversation = new List + { + new(ChatRole.User, query), + new(ChatRole.Assistant, responseText), + }; + + var item = new EvalItem(query, responseText, conversation) + { + Splitter = splitter, + }; + + if (!agentData.TryGetValue(completedEvent.ExecutorId, out var items)) + { + items = new List(); + agentData[completedEvent.ExecutorId] = items; + } + + items.Add(item); + invoked.Remove(completedEvent.ExecutorId); + } + } + + return agentData; + } +} diff --git a/dotnet/src/Microsoft.Agents.AI/Evaluation/AgentEvaluationExtensions.cs b/dotnet/src/Microsoft.Agents.AI/Evaluation/AgentEvaluationExtensions.cs new file mode 100644 index 0000000000..cfb179ab98 --- /dev/null +++ b/dotnet/src/Microsoft.Agents.AI/Evaluation/AgentEvaluationExtensions.cs @@ -0,0 +1,346 @@ +// Copyright (c) Microsoft. All rights reserved. 
+ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Extensions.AI; +using Microsoft.Extensions.AI.Evaluation; + +namespace Microsoft.Agents.AI; + +/// +/// Extension methods for evaluating agents, responses, and workflow runs. +/// +public static partial class AgentEvaluationExtensions +{ + /// + /// Evaluates an agent by running it against test queries and scoring the responses. + /// + /// The agent to evaluate. + /// Test queries to send to the agent. + /// The evaluator to score responses. + /// Display name for this evaluation run. + /// + /// Optional ground-truth expected outputs, one per query. When provided, + /// must be the same length as . Each value is + /// stamped on the corresponding . + /// + /// + /// Optional expected tool calls, one list per query. When provided, + /// must be the same length as . Each list is + /// stamped on the corresponding . + /// + /// + /// Optional conversation splitter to apply to all items. + /// Use , , + /// or a custom implementation. + /// + /// + /// Number of times to run each query (default 1). When greater than 1, each query is invoked + /// independently N times to measure consistency. Results contain all N × queries.Count items. + /// + /// Cancellation token. + /// Evaluation results. + public static async Task EvaluateAsync( + this AIAgent agent, + IEnumerable queries, + IAgentEvaluator evaluator, + string evalName = "Agent Framework Eval", + IEnumerable? expectedOutput = null, + IEnumerable>? expectedToolCalls = null, + IConversationSplitter? 
splitter = null, + int numRepetitions = 1, + CancellationToken cancellationToken = default) + { + var items = await RunAgentForEvalAsync(agent, queries, expectedOutput, expectedToolCalls, splitter, numRepetitions, cancellationToken).ConfigureAwait(false); + return await evaluator.EvaluateAsync(items, evalName, cancellationToken).ConfigureAwait(false); + } + + /// + /// Evaluates an agent using an MEAI evaluator directly. + /// + /// The agent to evaluate. + /// Test queries to send to the agent. + /// The MEAI evaluator (e.g., RelevanceEvaluator, CompositeEvaluator). + /// Chat configuration for the MEAI evaluator (includes the judge model). + /// Display name for this evaluation run. + /// + /// Optional ground-truth expected outputs, one per query. + /// + /// + /// Optional expected tool calls, one list per query. + /// + /// + /// Optional conversation splitter to apply to all items. + /// Use , , + /// or a custom implementation. + /// + /// Cancellation token. + /// Evaluation results. + public static async Task EvaluateAsync( + this AIAgent agent, + IEnumerable queries, + IEvaluator evaluator, + ChatConfiguration chatConfiguration, + string evalName = "Agent Framework Eval", + IEnumerable? expectedOutput = null, + IEnumerable>? expectedToolCalls = null, + IConversationSplitter? splitter = null, + int numRepetitions = 1, + CancellationToken cancellationToken = default) + { + var wrapped = new MeaiEvaluatorAdapter(evaluator, chatConfiguration); + return await agent.EvaluateAsync(queries, wrapped, evalName, expectedOutput, expectedToolCalls, splitter, numRepetitions, cancellationToken).ConfigureAwait(false); + } + + /// + /// Evaluates an agent by running it against test queries with multiple evaluators. + /// + /// The agent to evaluate. + /// Test queries to send to the agent. + /// The evaluators to score responses. + /// Display name for this evaluation run. + /// + /// Optional ground-truth expected outputs, one per query. 
+ /// + /// + /// Optional expected tool calls, one list per query. + /// + /// + /// Optional conversation splitter to apply to all items. + /// Use , , + /// or a custom implementation. + /// + /// Cancellation token. + /// One result per evaluator. + public static async Task> EvaluateAsync( + this AIAgent agent, + IEnumerable queries, + IEnumerable evaluators, + string evalName = "Agent Framework Eval", + IEnumerable? expectedOutput = null, + IEnumerable>? expectedToolCalls = null, + IConversationSplitter? splitter = null, + int numRepetitions = 1, + CancellationToken cancellationToken = default) + { + var items = await RunAgentForEvalAsync(agent, queries, expectedOutput, expectedToolCalls, splitter, numRepetitions, cancellationToken).ConfigureAwait(false); + + var results = new List(); + foreach (var evaluator in evaluators) + { + var result = await evaluator.EvaluateAsync(items, evalName, cancellationToken).ConfigureAwait(false); + results.Add(result); + } + + return results; + } + + /// + /// Evaluates pre-existing agent responses without re-running the agent. + /// + /// The agent (used for tool definitions). + /// Pre-existing agent responses. + /// The queries that produced each response (must match count). + /// The evaluator to score responses. + /// Display name for this evaluation run. + /// + /// Optional ground-truth expected outputs, one per query. + /// + /// + /// Optional expected tool calls, one list per query. + /// + /// Cancellation token. + /// Evaluation results. + public static async Task EvaluateAsync( + this AIAgent agent, + IEnumerable responses, + IEnumerable queries, + IAgentEvaluator evaluator, + string evalName = "Agent Framework Eval", + IEnumerable? expectedOutput = null, + IEnumerable>? 
+ expectedToolCalls = null, + CancellationToken cancellationToken = default) + { + var items = BuildItemsFromResponses(agent, responses, queries, expectedOutput, expectedToolCalls); + return await evaluator.EvaluateAsync(items, evalName, cancellationToken).ConfigureAwait(false); + } + + /// + /// Evaluates pre-existing agent responses using an MEAI evaluator directly. + /// + /// The agent (used for tool definitions). + /// Pre-existing agent responses. + /// The queries that produced each response (must match count). + /// The MEAI evaluator. + /// Chat configuration for the MEAI evaluator. + /// Display name for this evaluation run. + /// + /// Optional ground-truth expected outputs, one per query. + /// + /// + /// Optional expected tool calls, one list per query. + /// + /// Cancellation token. + /// Evaluation results. + public static async Task EvaluateAsync( + this AIAgent agent, + IEnumerable responses, + IEnumerable queries, + IEvaluator evaluator, + ChatConfiguration chatConfiguration, + string evalName = "Agent Framework Eval", + IEnumerable? expectedOutput = null, + IEnumerable>? expectedToolCalls = null, + CancellationToken cancellationToken = default) + { + var wrapped = new MeaiEvaluatorAdapter(evaluator, chatConfiguration); + return await agent.EvaluateAsync(responses, queries, wrapped, evalName, expectedOutput, expectedToolCalls, cancellationToken).ConfigureAwait(false); + } + + private static List BuildItemsFromResponses( + AIAgent agent, + IEnumerable responses, + IEnumerable queries, + IEnumerable? expectedOutput, + IEnumerable>? expectedToolCalls) + { + var responseList = responses.ToList(); + var queryList = queries.ToList(); + var expectedList = expectedOutput?.ToList(); + var expectedToolCallsList = expectedToolCalls?.ToList(); + + if (responseList.Count != queryList.Count) + { + throw new ArgumentException( + $"Got {queryList.Count} queries but {responseList.Count} responses.
Counts must match."); + } + + if (expectedList != null && expectedList.Count != queryList.Count) + { + throw new ArgumentException( + $"Got {queryList.Count} queries but {expectedList.Count} expectedOutput values. Counts must match."); + } + + if (expectedToolCallsList != null && expectedToolCallsList.Count != queryList.Count) + { + throw new ArgumentException( + $"Got {queryList.Count} queries but {expectedToolCallsList.Count} expectedToolCalls lists. Counts must match."); + } + + var items = new List(); + for (int i = 0; i < responseList.Count; i++) + { + var query = queryList[i]; + var response = responseList[i]; + + var messages = new List + { + new(ChatRole.User, query), + }; + messages.AddRange(response.Messages); + + var item = BuildEvalItem(query, response, messages, agent); + if (expectedList != null) + { + item.ExpectedOutput = expectedList[i]; + } + + if (expectedToolCallsList != null) + { + item.ExpectedToolCalls = expectedToolCallsList[i].ToList(); + } + + items.Add(item); + } + + return items; + } + + private static async Task> RunAgentForEvalAsync( + AIAgent agent, + IEnumerable queries, + IEnumerable? expectedOutput, + IEnumerable>? expectedToolCalls, + IConversationSplitter? splitter, + int numRepetitions, + CancellationToken cancellationToken) + { + if (numRepetitions < 1) + { + throw new ArgumentException($"numRepetitions must be >= 1, got {numRepetitions}.", nameof(numRepetitions)); + } + + var items = new List(); + var queryList = queries.ToList(); + var expectedList = expectedOutput?.ToList(); + var expectedToolCallsList = expectedToolCalls?.ToList(); + + if (expectedList != null && expectedList.Count != queryList.Count) + { + throw new ArgumentException( + $"Got {queryList.Count} queries but {expectedList.Count} expectedOutput values. 
Counts must match."); + } + + if (expectedToolCallsList != null && expectedToolCallsList.Count != queryList.Count) + { + throw new ArgumentException( + $"Got {queryList.Count} queries but {expectedToolCallsList.Count} expectedToolCalls lists. Counts must match."); + } + + for (int rep = 0; rep < numRepetitions; rep++) + { + for (int i = 0; i < queryList.Count; i++) + { + cancellationToken.ThrowIfCancellationRequested(); + + var query = queryList[i]; + var messages = new List + { + new(ChatRole.User, query), + }; + + var response = await agent.RunAsync(messages, cancellationToken: cancellationToken).ConfigureAwait(false); + var item = BuildEvalItem(query, response, messages, agent); + item.Splitter = splitter; + if (expectedList != null) + { + item.ExpectedOutput = expectedList[i]; + } + + if (expectedToolCallsList != null) + { + item.ExpectedToolCalls = expectedToolCallsList[i].ToList(); + } + + items.Add(item); + } + } + + return items; + } + + internal static EvalItem BuildEvalItem( + string query, + AgentResponse response, + List messages, + AIAgent agent) + { + // Add response messages to conversation + foreach (var msg in response.Messages) + { + if (!messages.Contains(msg)) + { + messages.Add(msg); + } + } + + return new EvalItem(query, response.Text, messages) + { + RawResponse = new ChatResponse(response.Messages.LastOrDefault() + ?? new ChatMessage(ChatRole.Assistant, response.Text)), + }; + } +} diff --git a/dotnet/src/Microsoft.Agents.AI/Evaluation/AgentEvaluationResults.cs b/dotnet/src/Microsoft.Agents.AI/Evaluation/AgentEvaluationResults.cs new file mode 100644 index 0000000000..6406760c49 --- /dev/null +++ b/dotnet/src/Microsoft.Agents.AI/Evaluation/AgentEvaluationResults.cs @@ -0,0 +1,126 @@ +// Copyright (c) Microsoft. All rights reserved. + +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.Extensions.AI.Evaluation; + +namespace Microsoft.Agents.AI; + +/// +/// Aggregate evaluation results across multiple items. 
+/// +public sealed class AgentEvaluationResults +{ + private readonly List _items; + + /// + /// Initializes a new instance of the class. + /// + /// Name of the evaluation provider. + /// Per-item MEAI evaluation results. + /// The original eval items that were evaluated, for auditing. + public AgentEvaluationResults(string provider, IEnumerable items, IReadOnlyList? inputItems = null) + { + this.Provider = provider; + this._items = new List(items); + this.InputItems = inputItems; + } + + /// Gets the evaluation provider name. + public string Provider { get; } + + /// Gets the portal URL for viewing results (Foundry only). + public Uri? ReportUrl { get; set; } + + /// Gets the per-item MEAI evaluation results. + public IReadOnlyList Items => this._items; + + /// + /// Gets the original eval items that produced these results, for auditing. + /// Each entry corresponds positionally to InputItems[i] + /// is the query/response that produced Items[i]. + /// + public IReadOnlyList? InputItems { get; } + + /// Gets per-agent results for workflow evaluations. + public IReadOnlyDictionary? SubResults { get; set; } + + /// Gets the number of items that passed. + public int Passed => this._items.Count(ItemPassed); + + /// Gets the number of items that failed. + public int Failed => this._items.Count(i => !ItemPassed(i)); + + /// Gets the total number of items evaluated. + public int Total => this._items.Count; + + /// Gets whether all items passed: none of this result's own items failed and, when sub-results exist, every sub-result passed. + public bool AllPassed + { + get + { + if (this.SubResults is not null) + { + return this.Failed == 0 && this.SubResults.Values.All(s => s.AllPassed); + } + + return this.Total > 0 && this.Failed == 0; + } + } + + /// + /// Asserts that all items passed. Throws on failure. + /// + /// Optional custom failure message. + /// Thrown when any items failed. + public void AssertAllPassed(string? message = null) + { + if (!this.AllPassed) + { + var detail = message ??
$"{this.Provider}: {this.Passed} passed, {this.Failed} failed out of {this.Total}."; + if (this.ReportUrl is not null) + { + detail += $" See {this.ReportUrl} for details."; + } + + if (this.SubResults is not null) + { + var failedAgents = this.SubResults + .Where(kvp => !kvp.Value.AllPassed) + .Select(kvp => kvp.Key); + detail += $" Failed agents: {string.Join(", ", failedAgents)}."; + } + + throw new InvalidOperationException(detail); + } + } + + // An item passes when it has at least one metric and no metric fails. + // NOTE: raw scores are only thresholded when the evaluator supplied no interpretation. + private static bool ItemPassed(EvaluationResult result) + { + foreach (var metric in result.Metrics.Values) + { + // An explicit interpretation is authoritative: score scales differ per evaluator + // (safety severity scores are lower-is-better), so a fixed cutoff would misjudge them. + if (metric.Interpretation is { } interpretation) + { + if (interpretation.Failed) + { + return false; + } + + continue; + } + + // No interpretation: fall back to the default 3.0 cutoff or the raw boolean value. + if (metric is NumericMetric { Value: < 3.0 } or BooleanMetric { Value: false }) + { + return false; + } + } + + return result.Metrics.Count > 0; + } +} diff --git a/dotnet/src/Microsoft.Agents.AI/Evaluation/CheckResult.cs b/dotnet/src/Microsoft.Agents.AI/Evaluation/CheckResult.cs new file mode 100644 index 0000000000..46f47bb3c9 --- /dev/null +++ b/dotnet/src/Microsoft.Agents.AI/Evaluation/CheckResult.cs @@ -0,0 +1,11 @@ +// Copyright (c) Microsoft. All rights reserved. + +namespace Microsoft.Agents.AI; + +/// +/// Result of a single check on a single evaluation item. +/// +/// Whether the check passed. +/// Human-readable explanation. +/// Name of the check that produced this result. +public sealed record EvalCheckResult(bool Passed, string Reason, string CheckName); diff --git a/dotnet/src/Microsoft.Agents.AI/Evaluation/EvalCheck.cs b/dotnet/src/Microsoft.Agents.AI/Evaluation/EvalCheck.cs new file mode 100644 index 0000000000..eae0750418 --- /dev/null +++ b/dotnet/src/Microsoft.Agents.AI/Evaluation/EvalCheck.cs @@ -0,0 +1,10 @@ +// Copyright (c) Microsoft. All rights reserved. + +namespace Microsoft.Agents.AI; + +/// +/// Delegate for a synchronous evaluation check on a single item.
+/// +/// The evaluation item. +/// The check result. +public delegate EvalCheckResult EvalCheck(EvalItem item); diff --git a/dotnet/src/Microsoft.Agents.AI/Evaluation/EvalChecks.cs b/dotnet/src/Microsoft.Agents.AI/Evaluation/EvalChecks.cs new file mode 100644 index 0000000000..5dfa2da612 --- /dev/null +++ b/dotnet/src/Microsoft.Agents.AI/Evaluation/EvalChecks.cs @@ -0,0 +1,86 @@ +// Copyright (c) Microsoft. All rights reserved. + +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.Extensions.AI; + +namespace Microsoft.Agents.AI; + +/// +/// Built-in check functions for common evaluation patterns. +/// +public static class EvalChecks +{ + /// + /// Creates a check that verifies the response contains all specified keywords. + /// + /// Keywords that must appear in the response. + /// An delegate. + public static EvalCheck KeywordCheck(params string[] keywords) + { + return KeywordCheck(caseSensitive: false, keywords); + } + + /// + /// Creates a check that verifies the response contains all specified keywords. + /// + /// Whether the comparison is case-sensitive. + /// Keywords that must appear in the response. + /// An delegate. + public static EvalCheck KeywordCheck(bool caseSensitive, params string[] keywords) + { + return (EvalItem item) => + { + var comparison = caseSensitive + ? StringComparison.Ordinal + : StringComparison.OrdinalIgnoreCase; + + var missing = keywords + .Where(kw => !item.Response.Contains(kw, comparison)) + .ToList(); + + var passed = missing.Count == 0; + var reason = passed + ? $"All keywords found: {string.Join(", ", keywords)}" + : $"Missing keywords: {string.Join(", ", missing)}"; + + return new EvalCheckResult(passed, reason, "keyword_check"); + }; + } + + /// + /// Creates a check that verifies specific tools were called in the conversation. + /// + /// Tool names that must appear in the conversation. + /// An delegate. 
+ public static EvalCheck ToolCalledCheck(params string[] toolNames) + { + return (EvalItem item) => + { + var calledTools = new HashSet(StringComparer.OrdinalIgnoreCase); + + foreach (var message in item.Conversation) + { + foreach (var content in message.Contents) + { + if (content is FunctionCallContent functionCall) + { + calledTools.Add(functionCall.Name); + } + } + } + + var missing = toolNames + .Where(t => !calledTools.Contains(t)) + .ToList(); + + var passed = missing.Count == 0; + var reason = passed + ? $"All tools called: {string.Join(", ", toolNames)}" + : $"Missing tool calls: {string.Join(", ", missing)}"; + + return new EvalCheckResult(passed, reason, "tool_called_check"); + }; + } +} diff --git a/dotnet/src/Microsoft.Agents.AI/Evaluation/EvalItem.cs b/dotnet/src/Microsoft.Agents.AI/Evaluation/EvalItem.cs new file mode 100644 index 0000000000..31822ee172 --- /dev/null +++ b/dotnet/src/Microsoft.Agents.AI/Evaluation/EvalItem.cs @@ -0,0 +1,141 @@ +// Copyright (c) Microsoft. All rights reserved. + +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.Extensions.AI; + +namespace Microsoft.Agents.AI; + +/// +/// Provider-agnostic data for a single evaluation item. +/// +public sealed class EvalItem +{ + /// + /// Initializes a new instance of the class. + /// + /// The user query. + /// The agent response text. + /// The full conversation as list. + public EvalItem(string query, string response, IReadOnlyList conversation) + { + this.Query = query; + this.Response = response; + this.Conversation = conversation; + } + + /// Gets the user query. + public string Query { get; } + + /// Gets the agent response text. + public string Response { get; } + + /// Gets the full conversation history. + public IReadOnlyList Conversation { get; } + + /// Gets or sets the tools available to the agent. + public IReadOnlyList? Tools { get; set; } + + /// Gets or sets grounding context for evaluation. + public string? 
Context { get; set; } + + /// Gets or sets the expected output for ground-truth comparison. + public string? ExpectedOutput { get; set; } + + /// + /// Gets or sets the expected tool calls for tool-correctness evaluation. + /// + /// + /// Each entry describes a tool call the agent should make. The evaluator + /// decides matching semantics (ordering, extras, argument checking). + /// See . + /// + public IReadOnlyList? ExpectedToolCalls { get; set; } + + /// Gets or sets the raw chat response for MEAI evaluators. + public ChatResponse? RawResponse { get; set; } + + /// + /// Gets or sets the conversation splitter for this item. + /// + /// + /// When set by orchestration functions (e.g. EvaluateAsync(splitter: ...)), + /// this is used as the default by . + /// Priority: explicit Split(splitter) argument > + /// > . + /// + public IConversationSplitter? Splitter { get; set; } + + /// + /// Splits the conversation into query messages and response messages. + /// + /// + /// The splitter to use. When null, uses + /// if set, otherwise . + /// + /// A tuple of (query messages, response messages). + public (IReadOnlyList QueryMessages, IReadOnlyList ResponseMessages) Split( + IConversationSplitter? splitter = null) + { + var effective = splitter ?? this.Splitter ?? ConversationSplitters.LastTurn; + return effective.Split(this.Conversation); + } + + /// + /// Splits a multi-turn conversation into one per user turn. + /// + /// + /// Each user message starts a new turn. The resulting item has cumulative context: + /// query messages contain the full conversation up to and including that user message, + /// and the response is everything up to the next user message. + /// + /// The full conversation to split. + /// Optional tools available to the agent. + /// Optional grounding context. + /// A list of eval items, one per user turn. + public static IReadOnlyList PerTurnItems( + IReadOnlyList conversation, + IReadOnlyList? tools = null, + string? 
context = null) + { + var items = new List(); + var userIndices = new List(); + + for (int i = 0; i < conversation.Count; i++) + { + if (conversation[i].Role == ChatRole.User) + { + userIndices.Add(i); + } + } + + for (int t = 0; t < userIndices.Count; t++) + { + int userIdx = userIndices[t]; + int nextBoundary = t + 1 < userIndices.Count + ? userIndices[t + 1] + : conversation.Count; + + var responseMessages = conversation.Skip(userIdx + 1).Take(nextBoundary - userIdx - 1).ToList(); + + var query = conversation[userIdx].Text ?? string.Empty; + var responseText = string.Join( + " ", + responseMessages + .Where(m => m.Role == ChatRole.Assistant && !string.IsNullOrEmpty(m.Text)) + .Select(m => m.Text)); + + var fullSlice = conversation.Take(nextBoundary).ToList(); + var item = new EvalItem(query, responseText, fullSlice) + { + Tools = tools, + Context = context, + }; + + items.Add(item); + } + + return items; + } +} diff --git a/dotnet/src/Microsoft.Agents.AI/Evaluation/ExpectedToolCall.cs b/dotnet/src/Microsoft.Agents.AI/Evaluation/ExpectedToolCall.cs new file mode 100644 index 0000000000..9b30899df4 --- /dev/null +++ b/dotnet/src/Microsoft.Agents.AI/Evaluation/ExpectedToolCall.cs @@ -0,0 +1,20 @@ +// Copyright (c) Microsoft. All rights reserved. + +using System.Collections.Generic; + +namespace Microsoft.Agents.AI; + +/// +/// A tool call that an agent is expected to make. +/// +/// +/// Used with EvaluateAsync to assert that the agent called the correct tools. +/// The evaluator decides matching semantics (order, extras, argument checking); +/// this type is pure data. +/// +/// The tool/function name (e.g. "get_weather"). +/// +/// Expected arguments. null means "don't check arguments". +/// When provided, evaluators typically do subset matching (all expected keys must be present). +/// +public record ExpectedToolCall(string Name, IReadOnlyDictionary? 
Arguments = null); diff --git a/dotnet/src/Microsoft.Agents.AI/Evaluation/FunctionEvaluator.cs b/dotnet/src/Microsoft.Agents.AI/Evaluation/FunctionEvaluator.cs new file mode 100644 index 0000000000..a9024c7750 --- /dev/null +++ b/dotnet/src/Microsoft.Agents.AI/Evaluation/FunctionEvaluator.cs @@ -0,0 +1,68 @@ +// Copyright (c) Microsoft. All rights reserved. + +using System; + +namespace Microsoft.Agents.AI; + +/// +/// Factory for creating delegates from typed lambda functions. +/// +public static class FunctionEvaluator +{ + /// + /// Creates a check from a function that takes the response text and returns a bool. + /// + /// Check name for reporting. + /// Function that returns true if the response passes. + public static EvalCheck Create(string name, Func check) + { + return (EvalItem item) => + { + var passed = check(item.Response); + return new EvalCheckResult(passed, passed ? "Passed" : "Failed", name); + }; + } + + /// + /// Creates a check from a function that takes response and expected text. + /// + /// Check name for reporting. + /// Function that returns true if the response passes. + public static EvalCheck Create(string name, Func check) + { + return (EvalItem item) => + { + var passed = check(item.Response, item.ExpectedOutput); + return new EvalCheckResult(passed, passed ? "Passed" : "Failed", name); + }; + } + + /// + /// Creates a check from a function that takes the full . + /// + /// Check name for reporting. + /// Function that returns true if the item passes. + public static EvalCheck Create(string name, Func check) + { + return (EvalItem item) => + { + var passed = check(item); + return new EvalCheckResult(passed, passed ? "Passed" : "Failed", name); + }; + } + + /// + /// Creates a check from a function that takes the full + /// and returns a . + /// + /// Check name (used as fallback if the result has no name). + /// Function that returns a full check result. 
+ public static EvalCheck Create(string name, Func check) + { + return (EvalItem item) => + { + var result = check(item); + return result with { CheckName = result.CheckName ?? name }; + }; + } +} diff --git a/dotnet/src/Microsoft.Agents.AI/Evaluation/IAgentEvaluator.cs b/dotnet/src/Microsoft.Agents.AI/Evaluation/IAgentEvaluator.cs new file mode 100644 index 0000000000..2dc84e35eb --- /dev/null +++ b/dotnet/src/Microsoft.Agents.AI/Evaluation/IAgentEvaluator.cs @@ -0,0 +1,33 @@ +// Copyright (c) Microsoft. All rights reserved. + +using System.Collections.Generic; +using System.Threading; +using System.Threading.Tasks; + +namespace Microsoft.Agents.AI; + +/// +/// Batch-oriented evaluator interface for agent evaluation. +/// +/// +/// Unlike MEAI's IEvaluator which evaluates one item at a time, +/// evaluates a batch of items. This enables +/// efficient cloud-based evaluation (e.g., Foundry) and aggregate result computation. +/// +public interface IAgentEvaluator +{ + /// Gets the evaluator name. + string Name { get; } + + /// + /// Evaluates a batch of items and returns aggregate results. + /// + /// The items to evaluate. + /// A display name for this evaluation run. + /// Cancellation token. + /// Aggregate evaluation results. + Task EvaluateAsync( + IReadOnlyList items, + string evalName = "Agent Framework Eval", + CancellationToken cancellationToken = default); +} diff --git a/dotnet/src/Microsoft.Agents.AI/Evaluation/IConversationSplitter.cs b/dotnet/src/Microsoft.Agents.AI/Evaluation/IConversationSplitter.cs new file mode 100644 index 0000000000..f07282e4de --- /dev/null +++ b/dotnet/src/Microsoft.Agents.AI/Evaluation/IConversationSplitter.cs @@ -0,0 +1,103 @@ +// Copyright (c) Microsoft. All rights reserved. + +using System.Collections.Generic; +using System.Linq; +using Microsoft.Extensions.AI; + +namespace Microsoft.Agents.AI; + +/// +/// Strategy for splitting a conversation into query and response halves for evaluation. 
+/// +/// +/// Use one of the built-in splitters from or implement +/// your own for domain-specific splitting logic (e.g., splitting before a memory-retrieval +/// tool call to evaluate recall quality). +/// +public interface IConversationSplitter +{ + /// + /// Splits a conversation into query messages and response messages. + /// + /// The full conversation to split. + /// A tuple of (query messages, response messages). + (IReadOnlyList QueryMessages, IReadOnlyList ResponseMessages) Split( + IReadOnlyList conversation); +} + +/// +/// Built-in conversation splitters for common evaluation patterns. +/// +/// +/// +/// : Evaluates whether the agent answered the latest question well. +/// : Evaluates whether the whole conversation trajectory served the original request. +/// +/// For custom splits, implement directly. +/// +public static class ConversationSplitters +{ + /// + /// Split at the last user message. Everything up to and including that message + /// is the query; everything after is the response. This is the default strategy. + /// + public static IConversationSplitter LastTurn { get; } = new LastTurnSplitter(); + + /// + /// The first user message (and any preceding system messages) is the query; + /// the entire remainder of the conversation is the response. + /// Evaluates overall conversation trajectory. 
+ /// + public static IConversationSplitter Full { get; } = new FullSplitter(); + + private sealed class LastTurnSplitter : IConversationSplitter + { + public (IReadOnlyList, IReadOnlyList) Split( + IReadOnlyList conversation) + { + int lastUserIdx = -1; + for (int i = 0; i < conversation.Count; i++) + { + if (conversation[i].Role == ChatRole.User) + { + lastUserIdx = i; + } + } + + if (lastUserIdx >= 0) + { + return ( + conversation.Take(lastUserIdx + 1).ToList(), + conversation.Skip(lastUserIdx + 1).ToList()); + } + + return (new List(), conversation.ToList()); + } + } + + private sealed class FullSplitter : IConversationSplitter + { + public (IReadOnlyList, IReadOnlyList) Split( + IReadOnlyList conversation) + { + int firstUserIdx = -1; + for (int i = 0; i < conversation.Count; i++) + { + if (conversation[i].Role == ChatRole.User) + { + firstUserIdx = i; + break; + } + } + + if (firstUserIdx >= 0) + { + return ( + conversation.Take(firstUserIdx + 1).ToList(), + conversation.Skip(firstUserIdx + 1).ToList()); + } + + return (new List(), conversation.ToList()); + } + } +} diff --git a/dotnet/src/Microsoft.Agents.AI/Evaluation/LocalEvaluator.cs b/dotnet/src/Microsoft.Agents.AI/Evaluation/LocalEvaluator.cs new file mode 100644 index 0000000000..2b664b0e3b --- /dev/null +++ b/dotnet/src/Microsoft.Agents.AI/Evaluation/LocalEvaluator.cs @@ -0,0 +1,66 @@ +// Copyright (c) Microsoft. All rights reserved. + +using System.Collections.Generic; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Extensions.AI.Evaluation; + +namespace Microsoft.Agents.AI; + +/// +/// Evaluator that runs check functions locally without API calls. +/// +public sealed class LocalEvaluator : IAgentEvaluator +{ + private readonly EvalCheck[] _checks; + + /// + /// Initializes a new instance of the class. + /// + /// The check functions to run on each item. 
/// <summary>
/// Evaluator that runs check functions locally without API calls.
/// </summary>
public sealed class LocalEvaluator : IAgentEvaluator
{
    private readonly EvalCheck[] _checks;

    /// <summary>
    /// Initializes a new instance of the <see cref="LocalEvaluator"/> class.
    /// </summary>
    /// <param name="checks">The check functions to run on each item.</param>
    /// <exception cref="System.ArgumentNullException"><paramref name="checks"/> is <see langword="null"/>.</exception>
    public LocalEvaluator(params EvalCheck[] checks)
    {
        if (checks is null)
        {
            throw new System.ArgumentNullException(nameof(checks));
        }

        // Copy the array so a caller mutating its own array later cannot alter this evaluator.
        this._checks = (EvalCheck[])checks.Clone();
    }

    /// <inheritdoc />
    public string Name => "LocalEvaluator";

    /// <inheritdoc />
    /// <remarks>
    /// Runs synchronously; the returned task is already completed. The results are
    /// positionally correlated with <paramref name="items"/>, which are also attached
    /// to the returned object as its input items.
    /// </remarks>
    public Task<AgentEvaluationResults> EvaluateAsync(
        IReadOnlyList<EvalItem> items,
        string evalName = "Local Eval",
        CancellationToken cancellationToken = default)
    {
        if (items is null)
        {
            throw new System.ArgumentNullException(nameof(items));
        }

        var results = new List<EvaluationResult>(items.Count);

        foreach (var item in items)
        {
            cancellationToken.ThrowIfCancellationRequested();
            results.Add(this.RunChecks(item));
        }

        return Task.FromResult(new AgentEvaluationResults(this.Name, results, inputItems: items));
    }

    /// <summary>Runs every configured check against one item and records one boolean metric per check.</summary>
    private EvaluationResult RunChecks(EvalItem item)
    {
        var evalResult = new EvaluationResult();

        foreach (var check in this._checks)
        {
            var checkResult = check(item);

            // Two checks may report the same name (e.g. the same built-in check configured twice).
            // Writing both under one key would let the later result overwrite the earlier one
            // and could hide a failure, so later duplicates get a numeric suffix.
            string metricName = UniqueMetricName(evalResult, checkResult.CheckName);

            evalResult.Metrics[metricName] = new BooleanMetric(
                metricName,
                checkResult.Passed,
                reason: checkResult.Reason)
            {
                Interpretation = new EvaluationMetricInterpretation
                {
                    Rating = checkResult.Passed
                        ? EvaluationRating.Good
                        : EvaluationRating.Unacceptable,
                    Failed = !checkResult.Passed,
                },
            };
        }

        return evalResult;
    }

    /// <summary>
    /// Returns <paramref name="baseName"/> when it is unused in <paramref name="result"/>;
    /// otherwise the first free name of the form <c>baseName#2</c>, <c>baseName#3</c>, ...
    /// </summary>
    private static string UniqueMetricName(EvaluationResult result, string baseName)
    {
        if (!result.Metrics.ContainsKey(baseName))
        {
            return baseName;
        }

        int suffix = 2;
        string candidate = $"{baseName}#{suffix}";
        while (result.Metrics.ContainsKey(candidate))
        {
            suffix++;
            candidate = $"{baseName}#{suffix}";
        }

        return candidate;
    }
}
/// <summary>
/// Adapter that wraps an MEAI <see cref="IEvaluator"/> into an <see cref="IAgentEvaluator"/>.
/// Runs the MEAI evaluator per-item and aggregates results.
/// </summary>
internal sealed class MeaiEvaluatorAdapter : IAgentEvaluator
{
    private readonly IEvaluator _evaluator;
    private readonly ChatConfiguration _chatConfiguration;

    /// <summary>
    /// Initializes a new instance of the <see cref="MeaiEvaluatorAdapter"/> class.
    /// </summary>
    /// <param name="evaluator">The MEAI evaluator to wrap.</param>
    /// <param name="chatConfiguration">Chat configuration for the evaluator (includes the judge model).</param>
    /// <exception cref="System.ArgumentNullException"><paramref name="evaluator"/> is <see langword="null"/>.</exception>
    public MeaiEvaluatorAdapter(IEvaluator evaluator, ChatConfiguration chatConfiguration)
    {
        // Fail at construction instead of with a NullReferenceException on first use of Name.
        this._evaluator = evaluator ?? throw new System.ArgumentNullException(nameof(evaluator));
        this._chatConfiguration = chatConfiguration;
    }

    /// <inheritdoc />
    public string Name => this._evaluator.GetType().Name;

    /// <inheritdoc />
    /// <remarks>
    /// Items are evaluated sequentially, one call to the wrapped evaluator per item.
    /// The results are positionally correlated with <paramref name="items"/>, which are
    /// attached to the returned object as its input items.
    /// </remarks>
    public async Task<AgentEvaluationResults> EvaluateAsync(
        IReadOnlyList<EvalItem> items,
        string evalName = "MEAI Eval",
        CancellationToken cancellationToken = default)
    {
        if (items is null)
        {
            throw new System.ArgumentNullException(nameof(items));
        }

        var results = new List<EvaluationResult>(items.Count);

        foreach (var item in items)
        {
            cancellationToken.ThrowIfCancellationRequested();

            // NOTE(review): the whole conversation is passed as the message history; if it already
            // ends with the evaluated response, the evaluator sees that response twice - confirm intent.
            var messages = item.Conversation.ToList();

            // Prefer the raw model response when the item carries one; otherwise wrap the
            // response text in a single assistant message.
            var chatResponse = item.RawResponse
                ?? new ChatResponse(new ChatMessage(ChatRole.Assistant, item.Response));

            var result = await this._evaluator.EvaluateAsync(
                messages,
                chatResponse,
                this._chatConfiguration,
                cancellationToken: cancellationToken).ConfigureAwait(false);

            results.Add(result);
        }

        // Attach the inputs, as LocalEvaluator does, so results can be audited against their items.
        return new AgentEvaluationResults(this.Name, results, inputItems: items);
    }
}
/// <summary>
/// Unit tests for the local evaluation building blocks: evaluation items, local and
/// function-based evaluators, built-in checks, result aggregation, and conversation splitting.
/// </summary>
public sealed class EvaluationTests
{
    /// <summary>Builds a single-turn item; the conversation defaults to [user query, assistant response].</summary>
    private static EvalItem CreateItem(
        string query = "What is the weather?",
        string response = "The weather in Seattle is sunny and 72°F.",
        IReadOnlyList<ChatMessage>? conversation = null)
    {
        conversation ??= new List<ChatMessage>
        {
            new(ChatRole.User, query),
            new(ChatRole.Assistant, response),
        };

        return new EvalItem(query, response, conversation);
    }

    /// <summary>Builds a result holding one boolean metric named "check" with a matching interpretation.</summary>
    private static EvaluationResult CreateBooleanResult(bool passed)
    {
        var evalResult = new EvaluationResult();
        evalResult.Metrics["check"] = new BooleanMetric("check", passed)
        {
            Interpretation = new EvaluationMetricInterpretation
            {
                Rating = passed ? EvaluationRating.Good : EvaluationRating.Unacceptable,
                Failed = !passed,
            },
        };

        return evalResult;
    }

    // ---------------------------------------------------------------
    // EvalItem tests
    // ---------------------------------------------------------------

    [Fact]
    public void EvalItem_Constructor_SetsProperties()
    {
        // Arrange & Act
        var item = CreateItem();

        // Assert
        Assert.Equal("What is the weather?", item.Query);
        Assert.Equal("The weather in Seattle is sunny and 72°F.", item.Response);
        Assert.Equal(2, item.Conversation.Count);
        Assert.Null(item.ExpectedOutput);
        Assert.Null(item.Context);
        Assert.Null(item.Tools);
    }

    [Fact]
    public void EvalItem_OptionalProperties_CanBeSet()
    {
        // Arrange & Act
        var item = CreateItem();
        item.ExpectedOutput = "sunny";
        item.Context = "Weather data for Seattle";

        // Assert
        Assert.Equal("sunny", item.ExpectedOutput);
        Assert.Equal("Weather data for Seattle", item.Context);
    }

    // ---------------------------------------------------------------
    // LocalEvaluator tests
    // ---------------------------------------------------------------

    [Fact]
    public async Task LocalEvaluator_WithPassingCheck_ReturnsPassedResultAsync()
    {
        // Arrange
        var evaluator = new LocalEvaluator(
            FunctionEvaluator.Create("always_pass", (string _) => true));

        var items = new List<EvalItem> { CreateItem() };

        // Act
        var results = await evaluator.EvaluateAsync(items);

        // Assert
        Assert.Equal("LocalEvaluator", results.Provider);
        Assert.Equal(1, results.Total);
        Assert.Equal(1, results.Passed);
        Assert.Equal(0, results.Failed);
        Assert.True(results.AllPassed);
    }

    [Fact]
    public async Task LocalEvaluator_WithFailingCheck_ReturnsFailedResultAsync()
    {
        // Arrange
        var evaluator = new LocalEvaluator(
            FunctionEvaluator.Create("always_fail", (string _) => false));

        var items = new List<EvalItem> { CreateItem() };

        // Act
        var results = await evaluator.EvaluateAsync(items);

        // Assert
        Assert.Equal(1, results.Total);
        Assert.Equal(0, results.Passed);
        Assert.Equal(1, results.Failed);
        Assert.False(results.AllPassed);
    }

    [Fact]
    public async Task LocalEvaluator_WithMultipleChecks_AllChecksRunAsync()
    {
        // Arrange
        var evaluator = new LocalEvaluator(
            FunctionEvaluator.Create("check1", (string _) => true),
            FunctionEvaluator.Create("check2", (string _) => true));

        var items = new List<EvalItem> { CreateItem() };

        // Act
        var results = await evaluator.EvaluateAsync(items);

        // Assert
        Assert.Equal(1, results.Total);
        Assert.True(results.AllPassed);
        var itemResult = results.Items[0];
        Assert.Equal(2, itemResult.Metrics.Count);
        Assert.True(itemResult.Metrics.ContainsKey("check1"));
        Assert.True(itemResult.Metrics.ContainsKey("check2"));
    }

    [Fact]
    public async Task LocalEvaluator_WithMultipleItems_EvaluatesAllAsync()
    {
        // Arrange
        var evaluator = new LocalEvaluator(
            EvalChecks.KeywordCheck("weather"));

        var items = new List<EvalItem>
        {
            CreateItem(response: "The weather is sunny."),
            CreateItem(response: "I don't know about that topic."),
        };

        // Act
        var results = await evaluator.EvaluateAsync(items);

        // Assert
        Assert.Equal(2, results.Total);
        Assert.Equal(1, results.Passed);
        Assert.Equal(1, results.Failed);
    }

    [Fact]
    public async Task LocalEvaluator_PopulatesInputItems_ForAuditingAsync()
    {
        // Arrange
        var check = FunctionEvaluator.Create("is_sunny",
            (string response) => response.Contains("sunny", StringComparison.OrdinalIgnoreCase));

        var evaluator = new LocalEvaluator(check);
        var items = new List<EvalItem>
        {
            CreateItem(query: "Weather?", response: "It's sunny!"),
            CreateItem(query: "Temp?", response: "72 degrees"),
        };

        // Act
        var results = await evaluator.EvaluateAsync(items);

        // Assert — InputItems carries the original query/response for auditing
        Assert.NotNull(results.InputItems);
        Assert.Equal(2, results.InputItems.Count);
        Assert.Equal("Weather?", results.InputItems[0].Query);
        Assert.Equal("It's sunny!", results.InputItems[0].Response);
        Assert.Equal("Temp?", results.InputItems[1].Query);
        Assert.Equal("72 degrees", results.InputItems[1].Response);

        // Results and InputItems are positionally correlated
        Assert.Equal(results.Items.Count, results.InputItems.Count);
    }

    // ---------------------------------------------------------------
    // FunctionEvaluator tests
    // ---------------------------------------------------------------

    [Fact]
    public async Task FunctionEvaluator_ResponseOnly_PassesResponseAsync()
    {
        // Arrange
        var check = FunctionEvaluator.Create("length_check",
            (string response) => response.Length > 10);

        var evaluator = new LocalEvaluator(check);
        var items = new List<EvalItem> { CreateItem() };

        // Act
        var results = await evaluator.EvaluateAsync(items);

        // Assert
        Assert.True(results.AllPassed);
    }

    [Fact]
    public async Task FunctionEvaluator_WithExpected_PassesExpectedAsync()
    {
        // Arrange
        var check = FunctionEvaluator.Create("contains_expected",
            (string response, string? expectedOutput) =>
                expectedOutput != null && response.Contains(expectedOutput, StringComparison.OrdinalIgnoreCase));

        var evaluator = new LocalEvaluator(check);
        var item = CreateItem();
        item.ExpectedOutput = "sunny";
        var items = new List<EvalItem> { item };

        // Act
        var results = await evaluator.EvaluateAsync(items);

        // Assert
        Assert.True(results.AllPassed);
    }

    [Fact]
    public async Task FunctionEvaluator_FullItem_AccessesAllFieldsAsync()
    {
        // Arrange
        var check = FunctionEvaluator.Create("full_check",
            (EvalItem item) => item.Query.Contains("weather", StringComparison.OrdinalIgnoreCase)
                && item.Response.Length > 0);

        var evaluator = new LocalEvaluator(check);
        var items = new List<EvalItem> { CreateItem() };

        // Act
        var results = await evaluator.EvaluateAsync(items);

        // Assert
        Assert.True(results.AllPassed);
    }

    [Fact]
    public async Task FunctionEvaluator_WithCheckResult_ReturnsCustomReasonAsync()
    {
        // Arrange
        var check = FunctionEvaluator.Create("custom_check",
            (EvalItem _) => new EvalCheckResult(true, "Custom reason", "custom_check"));

        var evaluator = new LocalEvaluator(check);
        var items = new List<EvalItem> { CreateItem() };

        // Act
        var results = await evaluator.EvaluateAsync(items);

        // Assert
        Assert.True(results.AllPassed);
        var metric = results.Items[0].Get<BooleanMetric>("custom_check");
        Assert.Equal("Custom reason", metric.Reason);
    }

    // ---------------------------------------------------------------
    // EvalChecks tests
    // ---------------------------------------------------------------

    [Fact]
    public async Task KeywordCheck_AllKeywordsPresent_PassesAsync()
    {
        // Arrange
        var evaluator = new LocalEvaluator(
            EvalChecks.KeywordCheck("weather", "sunny"));

        var items = new List<EvalItem> { CreateItem() };

        // Act
        var results = await evaluator.EvaluateAsync(items);

        // Assert
        Assert.True(results.AllPassed);
    }

    [Fact]
    public async Task KeywordCheck_MissingKeyword_FailsAsync()
    {
        // Arrange
        var evaluator = new LocalEvaluator(
            EvalChecks.KeywordCheck("snow"));

        var items = new List<EvalItem> { CreateItem() };

        // Act
        var results = await evaluator.EvaluateAsync(items);

        // Assert
        Assert.False(results.AllPassed);
    }

    [Fact]
    public async Task KeywordCheck_CaseInsensitiveByDefault_PassesAsync()
    {
        // Arrange
        var evaluator = new LocalEvaluator(
            EvalChecks.KeywordCheck("WEATHER", "SUNNY"));

        var items = new List<EvalItem> { CreateItem() };

        // Act
        var results = await evaluator.EvaluateAsync(items);

        // Assert
        Assert.True(results.AllPassed);
    }

    [Fact]
    public async Task KeywordCheck_CaseSensitive_FailsOnWrongCaseAsync()
    {
        // Arrange
        var evaluator = new LocalEvaluator(
            EvalChecks.KeywordCheck(caseSensitive: true, "WEATHER"));

        var items = new List<EvalItem> { CreateItem() };

        // Act
        var results = await evaluator.EvaluateAsync(items);

        // Assert
        Assert.False(results.AllPassed);
    }

    [Fact]
    public async Task ToolCalledCheck_ToolPresent_PassesAsync()
    {
        // Arrange
        var conversation = new List<ChatMessage>
        {
            new(ChatRole.User, "What is the weather?"),
            new(ChatRole.Assistant, new List<AIContent>
            {
                new FunctionCallContent("call1", "get_weather", new Dictionary<string, object?> { ["city"] = "Seattle" }),
            }),
            new(ChatRole.Tool, new List<AIContent>
            {
                new FunctionResultContent("call1", "72°F and sunny"),
            }),
            new(ChatRole.Assistant, "The weather is sunny and 72°F."),
        };

        var item = CreateItem(conversation: conversation);
        var evaluator = new LocalEvaluator(
            EvalChecks.ToolCalledCheck("get_weather"));

        // Act
        var results = await evaluator.EvaluateAsync(new List<EvalItem> { item });

        // Assert
        Assert.True(results.AllPassed);
    }

    [Fact]
    public async Task ToolCalledCheck_ToolMissing_FailsAsync()
    {
        // Arrange
        var evaluator = new LocalEvaluator(
            EvalChecks.ToolCalledCheck("get_weather"));

        var items = new List<EvalItem> { CreateItem() };

        // Act
        var results = await evaluator.EvaluateAsync(items);

        // Assert
        Assert.False(results.AllPassed);
    }

    // ---------------------------------------------------------------
    // AgentEvaluationResults tests
    // ---------------------------------------------------------------

    [Fact]
    public void AgentEvaluationResults_AllPassed_WhenAllMetricsGood()
    {
        // Arrange
        var evalResult = CreateBooleanResult(passed: true);

        // Act
        var results = new AgentEvaluationResults("test", new[] { evalResult });

        // Assert
        Assert.True(results.AllPassed);
        Assert.Equal(1, results.Passed);
        Assert.Equal(0, results.Failed);
    }

    [Fact]
    public void AgentEvaluationResults_NotAllPassed_WhenMetricFailed()
    {
        // Arrange
        var evalResult = CreateBooleanResult(passed: false);

        // Act
        var results = new AgentEvaluationResults("test", new[] { evalResult });

        // Assert
        Assert.False(results.AllPassed);
        Assert.Equal(0, results.Passed);
        Assert.Equal(1, results.Failed);
    }

    [Fact]
    public void AssertAllPassed_ThrowsOnFailure()
    {
        // Arrange
        var results = new AgentEvaluationResults("test", new[] { CreateBooleanResult(passed: false) });

        // Act & Assert
        // NOTE(review): the concrete exception type thrown by AssertAllPassed is not visible in this
        // file; ThrowsAny accepts any exception type - TODO narrow to the exact type once confirmed.
        var ex = Assert.ThrowsAny<Exception>(() => results.AssertAllPassed());
        Assert.Contains("0 passed", ex.Message);
        Assert.Contains("1 failed", ex.Message);
    }

    [Fact]
    public void AssertAllPassed_DoesNotThrowOnSuccess()
    {
        // Arrange
        var results = new AgentEvaluationResults("test", new[] { CreateBooleanResult(passed: true) });

        // Act & Assert (no exception)
        results.AssertAllPassed();
    }

    [Fact]
    public void AgentEvaluationResults_NumericMetric_HighScorePasses()
    {
        // Arrange — no interpretation attached; pass/fail is derived from the score alone
        var evalResult = new EvaluationResult();
        evalResult.Metrics["relevance"] = new NumericMetric("relevance", 4.5);

        // Act
        var results = new AgentEvaluationResults("test", new[] { evalResult });

        // Assert
        Assert.True(results.AllPassed);
    }

    [Fact]
    public void AgentEvaluationResults_NumericMetric_LowScoreFails()
    {
        // Arrange — no interpretation attached; pass/fail is derived from the score alone
        var evalResult = new EvaluationResult();
        evalResult.Metrics["relevance"] = new NumericMetric("relevance", 2.0);

        // Act
        var results = new AgentEvaluationResults("test", new[] { evalResult });

        // Assert
        Assert.False(results.AllPassed);
    }

    [Fact]
    public void AgentEvaluationResults_SubResults_AllPassedChecksChildren()
    {
        // Arrange
        var passResult = CreateBooleanResult(passed: true);
        var failResult = CreateBooleanResult(passed: false);

        // Act — the parent has no items of its own; only the children carry results
        var results = new AgentEvaluationResults("test", Array.Empty<EvaluationResult>())
        {
            SubResults = new Dictionary<string, AgentEvaluationResults>
            {
                ["agent1"] = new("test", new[] { passResult }),
                ["agent2"] = new("test", new[] { failResult }),
            },
        };

        // Assert
        Assert.False(results.AllPassed);
    }

    // ---------------------------------------------------------------
    // Mixed evaluator tests
    // ---------------------------------------------------------------

    [Fact]
    public async Task LocalEvaluator_MixedChecks_ReportsCorrectCountsAsync()
    {
        // Arrange
        var evaluator = new LocalEvaluator(
            EvalChecks.KeywordCheck("weather"),
            EvalChecks.KeywordCheck("snow"),
            FunctionEvaluator.Create("is_long", (string r) => r.Length > 5));

        var items = new List<EvalItem> { CreateItem() };

        // Act
        var results = await evaluator.EvaluateAsync(items);

        // Assert
        Assert.Equal(1, results.Total);

        // One item with 3 checks: "weather" passes, "snow" fails, "is_long" passes
        // The item has one failed metric so it should count as failed
        Assert.Equal(0, results.Passed);
        Assert.Equal(1, results.Failed);
    }

    // ---------------------------------------------------------------
    // Conversation Split tests
    // ---------------------------------------------------------------

    /// <summary>Three user/assistant turns: Seattle weather, Paris weather, comparison.</summary>
    private static List<ChatMessage> CreateMultiTurnConversation()
    {
        return new List<ChatMessage>
        {
            new(ChatRole.User, "What's the weather in Seattle?"),
            new(ChatRole.Assistant, "Seattle is 62°F and cloudy."),
            new(ChatRole.User, "And Paris?"),
            new(ChatRole.Assistant, "Paris is 68°F and partly sunny."),
            new(ChatRole.User, "Compare them."),
            new(ChatRole.Assistant, "Seattle is cooler; Paris is warmer and sunnier."),
        };
    }

    [Fact]
    public void Split_LastTurn_SplitsAtLastUserMessage()
    {
        // Arrange
        var conversation = CreateMultiTurnConversation();
        var item = new EvalItem("Compare them.", "Seattle is cooler; Paris is warmer and sunnier.", conversation);

        // Act
        var (query, response) = item.Split(ConversationSplitters.LastTurn);

        // Assert — query includes everything up to and including "Compare them."
        Assert.Equal(5, query.Count);
        Assert.Equal(ChatRole.User, query[query.Count - 1].Role);
        Assert.Contains("Compare", query[query.Count - 1].Text);

        // Response is the final assistant message
        Assert.Single(response);
        Assert.Equal(ChatRole.Assistant, response[0].Role);
    }

    [Fact]
    public void Split_Full_SplitsAtFirstUserMessage()
    {
        // Arrange
        var conversation = CreateMultiTurnConversation();
        var item = new EvalItem("What's the weather in Seattle?", "Full trajectory", conversation);

        // Act
        var (query, response) = item.Split(ConversationSplitters.Full);

        // Assert — query is just the first user message
        Assert.Single(query);
        Assert.Contains("Seattle", query[0].Text);

        // Response is everything after
        Assert.Equal(5, response.Count);
    }

    [Fact]
    public void Split_Full_IncludesSystemMessagesInQuery()
    {
        // Arrange
        var conversation = new List<ChatMessage>
        {
            new(ChatRole.System, "You are a weather assistant."),
            new(ChatRole.User, "What's the weather?"),
            new(ChatRole.Assistant, "It's sunny."),
        };

        var item = new EvalItem("What's the weather?", "It's sunny.", conversation);

        // Act
        var (query, response) = item.Split(ConversationSplitters.Full);

        // Assert — system message + first user message
        Assert.Equal(2, query.Count);
        Assert.Equal(ChatRole.System, query[0].Role);
        Assert.Equal(ChatRole.User, query[1].Role);
        Assert.Single(response);
    }

    [Fact]
    public void Split_DefaultIsLastTurn()
    {
        // Arrange
        var conversation = CreateMultiTurnConversation();
        var item = new EvalItem("Compare them.", "response", conversation);

        // Act — no split specified
        var (query, response) = item.Split();

        // Assert — same as LastTurn
        Assert.Equal(5, query.Count);
        Assert.Single(response);
    }

    [Fact]
    public void Split_SplitterProperty_UsedWhenNoExplicitSplit()
    {
        // Arrange
        var conversation = CreateMultiTurnConversation();
        var item = new EvalItem("query", "response", conversation)
        {
            Splitter = ConversationSplitters.Full,
        };

        // Act — no explicit split, should use Splitter
        var (query, response) = item.Split();

        // Assert — Full split
        Assert.Single(query);
        Assert.Equal(5, response.Count);
    }

    [Fact]
    public void Split_ExplicitSplitter_OverridesSplitterProperty()
    {
        // Arrange
        var conversation = CreateMultiTurnConversation();
        var item = new EvalItem("query", "response", conversation)
        {
            Splitter = ConversationSplitters.Full,
        };

        // Act — explicit LastTurn overrides Full
        var (query, response) = item.Split(ConversationSplitters.LastTurn);

        // Assert — LastTurn behavior
        Assert.Equal(5, query.Count);
        Assert.Single(response);
    }

    [Fact]
    public void Split_WithToolMessages_PreservesToolPairs()
    {
        // Arrange
        var conversation = new List<ChatMessage>
        {
            new(ChatRole.User, "What's the weather?"),
            new(ChatRole.Assistant, new List<AIContent>
            {
                new FunctionCallContent("c1", "get_weather", new Dictionary<string, object?> { ["city"] = "Seattle" }),
            }),
            new(ChatRole.Tool, new List<AIContent>
            {
                new FunctionResultContent("c1", "62°F, cloudy"),
            }),
            new(ChatRole.Assistant, "Seattle is 62°F and cloudy."),
            new(ChatRole.User, "Thanks!"),
            new(ChatRole.Assistant, "You're welcome!"),
        };

        var item = new EvalItem("Thanks!", "You're welcome!", conversation);

        // Act
        var (query, response) = item.Split(ConversationSplitters.LastTurn);

        // Assert — tool messages stay in query context
        Assert.Equal(5, query.Count);
        Assert.Equal(ChatRole.Tool, query[2].Role);
        Assert.Single(response);
    }

    [Fact]
    public void ConversationSplitters_LastTurn_CanBeUsedAsCustomFallback()
    {
        // Arrange
        var conversation = CreateMultiTurnConversation();

        // Act — use ConversationSplitters.LastTurn directly
        var (query, response) = ConversationSplitters.LastTurn.Split(conversation);

        // Assert
        Assert.Equal(5, query.Count);
        Assert.Single(response);
    }

    // ---------------------------------------------------------------
    // PerTurnItems tests
    // ---------------------------------------------------------------

    [Fact]
    public void PerTurnItems_SplitsMultiTurnConversation()
    {
        // Arrange
        var conversation = CreateMultiTurnConversation();

        // Act
        var items = EvalItem.PerTurnItems(conversation);

        // Assert — 3 user messages = 3 items
        Assert.Equal(3, items.Count);

        // First turn: "What's the weather in Seattle?"
        Assert.Contains("Seattle", items[0].Query);
        Assert.Contains("62°F", items[0].Response);
        Assert.Equal(2, items[0].Conversation.Count);

        // Second turn: "And Paris?"
        Assert.Contains("Paris", items[1].Query);
        Assert.Contains("68°F", items[1].Response);
        Assert.Equal(4, items[1].Conversation.Count);

        // Third turn: "Compare them."
        Assert.Contains("Compare", items[2].Query);
        Assert.Contains("cooler", items[2].Response);
        Assert.Equal(6, items[2].Conversation.Count);
    }

    [Fact]
    public void PerTurnItems_PropagatesToolsAndContext()
    {
        // Arrange
        var conversation = CreateMultiTurnConversation();

        // Act — only context is supplied here; tool propagation is not exercised by this test
        var items = EvalItem.PerTurnItems(
            conversation,
            context: "Weather database");

        // Assert
        Assert.All(items, item => Assert.Equal("Weather database", item.Context));
    }

    [Fact]
    public void PerTurnItems_SingleTurn_ReturnsOneItem()
    {
        // Arrange
        var conversation = new List<ChatMessage>
        {
            new(ChatRole.User, "Hello"),
            new(ChatRole.Assistant, "Hi there!"),
        };

        // Act
        var items = EvalItem.PerTurnItems(conversation);

        // Assert
        Assert.Single(items);
        Assert.Equal("Hello", items[0].Query);
        Assert.Equal("Hi there!", items[0].Response);
    }

    // ---------------------------------------------------------------
    // Custom IConversationSplitter tests
    // ---------------------------------------------------------------

    /// <summary>
    /// Six messages: a store turn, then a recall turn whose answer goes through a
    /// "retrieve_memory" tool call (index 3), its tool result, and the final reply.
    /// </summary>
    private static List<ChatMessage> CreateMemoryConversation()
    {
        return new List<ChatMessage>
        {
            new(ChatRole.User, "Remember this"),
            new(ChatRole.Assistant, "Storing..."),
            new(ChatRole.User, "What did I say?"),
            new(ChatRole.Assistant, new List<AIContent>
            {
                new FunctionCallContent("c1", "retrieve_memory"),
            }),
            new(ChatRole.Tool, new List<AIContent>
            {
                new FunctionResultContent("c1", "You said: Remember this"),
            }),
            new(ChatRole.Assistant, "You said 'Remember this'."),
        };
    }

    [Fact]
    public void Split_CustomSplitter_IsUsed()
    {
        // Arrange — splitter that splits before a tool call message
        var conversation = CreateMemoryConversation();
        var splitter = new MemorySplitter();
        var item = new EvalItem("What did I say?", "You said 'Remember this'.", conversation);

        // Act
        var (query, response) = item.Split(splitter);

        // Assert — split before the tool call
        Assert.Equal(3, query.Count);
        Assert.Equal(3, response.Count);
    }

    [Fact]
    public void Split_CustomSplitter_WorksAsItemProperty()
    {
        // Arrange — custom splitter set on the item (simulating call-site override)
        var conversation = CreateMemoryConversation();
        var item = new EvalItem("What did I say?", "You said 'Remember this'.", conversation)
        {
            Splitter = new MemorySplitter(),
        };

        // Act — no explicit splitter, uses item.Splitter
        var (query, response) = item.Split();

        // Assert — custom splitter was used
        Assert.Equal(3, query.Count);
        Assert.Equal(3, response.Count);
    }

    /// <summary>
    /// Splits immediately before the first assistant message that calls "retrieve_memory";
    /// falls back to the last-turn split when no such call exists.
    /// </summary>
    private sealed class MemorySplitter : IConversationSplitter
    {
        public (IReadOnlyList<ChatMessage> QueryMessages, IReadOnlyList<ChatMessage> ResponseMessages) Split(
            IReadOnlyList<ChatMessage> conversation)
        {
            for (int i = 0; i < conversation.Count; i++)
            {
                var msg = conversation[i];
                if (msg.Role == ChatRole.Assistant && msg.Contents != null)
                {
                    foreach (var content in msg.Contents)
                    {
                        if (content is FunctionCallContent fc && fc.Name == "retrieve_memory")
                        {
                            return (
                                conversation.Take(i).ToList(),
                                conversation.Skip(i).ToList());
                        }
                    }
                }
            }

            // Fallback to last-turn split
            return ConversationSplitters.LastTurn.Split(conversation);
        }
    }

    // ---------------------------------------------------------------
    // ExpectedToolCall tests
    // ---------------------------------------------------------------

    [Fact]
    public void ExpectedToolCall_NameOnly()
    {
        var tc = new ExpectedToolCall("get_weather");
        Assert.Equal("get_weather", tc.Name);
        Assert.Null(tc.Arguments);
    }

    [Fact]
    public void ExpectedToolCall_NameAndArgs()
    {
        // NOTE(review): the dictionary value type is assumed to be object - confirm against ExpectedToolCall.
        var args = new Dictionary<string, object> { ["location"] = "NYC" };
        var tc = new ExpectedToolCall("get_weather", args);
        Assert.Equal("get_weather", tc.Name);
        Assert.NotNull(tc.Arguments);
        Assert.Equal("NYC", tc.Arguments["location"]);
    }

    [Fact]
    public void EvalItem_ExpectedToolCalls_DefaultNull()
    {
        var item = CreateItem();
        Assert.Null(item.ExpectedToolCalls);
    }

    [Fact]
    public void EvalItem_ExpectedToolCalls_CanBeSet()
    {
        var item = CreateItem();
        var calls = new List<ExpectedToolCall>
        {
            new("get_weather", new Dictionary<string, object> { ["location"] = "NYC" }),
            new("book_flight"),
        };
        item.ExpectedToolCalls = calls;

        Assert.NotNull(item.ExpectedToolCalls);
        Assert.Equal(2, item.ExpectedToolCalls.Count);
        Assert.Equal("get_weather", item.ExpectedToolCalls[0].Name);
        Assert.Null(item.ExpectedToolCalls[1].Arguments);
    }
}
02-agents}/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/Program.cs (98%) rename dotnet/samples/{GettingStarted => 02-agents}/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/README.md (100%) diff --git a/dotnet/agent-framework-dotnet.slnx b/dotnet/agent-framework-dotnet.slnx index 6e75bd355b..1c47bfe2b5 100644 --- a/dotnet/agent-framework-dotnet.slnx +++ b/dotnet/agent-framework-dotnet.slnx @@ -5,230 +5,214 @@ + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -236,60 +220,118 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -298,7 +340,6 @@ - @@ -321,6 +362,10 @@ + + + + @@ -331,6 +376,10 @@ + + + + @@ -396,6 +445,10 @@ + + + + @@ -403,6 +456,10 @@ + + + + @@ -426,6 +483,7 @@ + @@ -433,11 +491,11 @@ - + @@ -451,11 +509,11 @@ + - @@ -472,19 +530,20 @@ + - + - \ No newline at end of file + diff --git a/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step02_SelfReflection/Program.cs 
b/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step02_SelfReflection/Program.cs index ca1e8a06ad..aee7473f60 100644 --- a/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step02_SelfReflection/Program.cs +++ b/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step02_SelfReflection/Program.cs @@ -13,7 +13,6 @@ // https://learn.microsoft.com/dotnet/ai/evaluation/libraries using Azure.AI.Projects; -using Azure.AI.Projects.OpenAI; using Azure.Identity; using Microsoft.Agents.AI; using Microsoft.Extensions.AI; diff --git a/dotnet/samples/GettingStarted/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/FoundryAgents_Evaluations_Step03_AllPatterns.csproj b/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/FoundryAgents_Evaluations_Step03_AllPatterns.csproj similarity index 100% rename from dotnet/samples/GettingStarted/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/FoundryAgents_Evaluations_Step03_AllPatterns.csproj rename to dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/FoundryAgents_Evaluations_Step03_AllPatterns.csproj diff --git a/dotnet/samples/GettingStarted/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/Program.cs b/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/Program.cs similarity index 98% rename from dotnet/samples/GettingStarted/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/Program.cs rename to dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/Program.cs index ec6531cf85..9f7c6e3e95 100644 --- a/dotnet/samples/GettingStarted/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/Program.cs +++ b/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/Program.cs @@ -1,4 +1,4 @@ -// Copyright (c) Microsoft. All rights reserved. +// Copyright (c) Microsoft. All rights reserved. 
// This sample demonstrates all evaluation patterns available in Agent Framework for .NET. // It covers: @@ -13,10 +13,8 @@ // Mirrors the Python sample: evaluate_all_patterns_sample.py using Azure.AI.Projects; -using Azure.AI.Projects.OpenAI; using Azure.Identity; using Microsoft.Agents.AI; -using Microsoft.Agents.AI.AzureAI; using Microsoft.Extensions.AI; using Microsoft.Extensions.AI.Evaluation; using Microsoft.Extensions.AI.Evaluation.Quality; @@ -315,7 +313,7 @@ static void PrintResults(string title, AgentEvaluationResults results) /// Example custom splitter that splits before the first tool call. /// Evaluates whether the agent's tool usage and final response are appropriate. /// -sealed class WeatherToolSplitter : IConversationSplitter +internal sealed class WeatherToolSplitter : IConversationSplitter { public (IReadOnlyList QueryMessages, IReadOnlyList ResponseMessages) Split( IReadOnlyList conversation) @@ -334,4 +332,4 @@ sealed class WeatherToolSplitter : IConversationSplitter // Fallback: use the default LastTurn split return ConversationSplitters.LastTurn.Split(conversation); } -} \ No newline at end of file +} diff --git a/dotnet/samples/GettingStarted/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/README.md b/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/README.md similarity index 100% rename from dotnet/samples/GettingStarted/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/README.md rename to dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/README.md diff --git a/dotnet/src/Microsoft.Agents.AI.AzureAI/Evaluation/FoundryEvals.cs b/dotnet/src/Microsoft.Agents.AI.AzureAI/Evaluation/FoundryEvals.cs index 6e650683d4..a68fc43cd8 100644 --- a/dotnet/src/Microsoft.Agents.AI.AzureAI/Evaluation/FoundryEvals.cs +++ b/dotnet/src/Microsoft.Agents.AI.AzureAI/Evaluation/FoundryEvals.cs @@ -1,6 +1,5 @@ // Copyright (c) Microsoft. All rights reserved. 
-using System.Linq; using Microsoft.Extensions.AI; using Microsoft.Extensions.AI.Evaluation; using Microsoft.Extensions.AI.Evaluation.Quality; diff --git a/dotnet/src/Microsoft.Agents.AI.AzureAI/Microsoft.Agents.AI.AzureAI.csproj b/dotnet/src/Microsoft.Agents.AI.AzureAI/Microsoft.Agents.AI.AzureAI.csproj index 2488a20519..41225c24ef 100644 --- a/dotnet/src/Microsoft.Agents.AI.AzureAI/Microsoft.Agents.AI.AzureAI.csproj +++ b/dotnet/src/Microsoft.Agents.AI.AzureAI/Microsoft.Agents.AI.AzureAI.csproj @@ -16,11 +16,18 @@ + + + + + + - - + + + diff --git a/dotnet/src/Microsoft.Agents.AI.Workflows/Microsoft.Agents.AI.Workflows.csproj b/dotnet/src/Microsoft.Agents.AI.Workflows/Microsoft.Agents.AI.Workflows.csproj index c103ead32d..0e4e20e47b 100644 --- a/dotnet/src/Microsoft.Agents.AI.Workflows/Microsoft.Agents.AI.Workflows.csproj +++ b/dotnet/src/Microsoft.Agents.AI.Workflows/Microsoft.Agents.AI.Workflows.csproj @@ -54,4 +54,9 @@ + + + + + diff --git a/dotnet/src/Microsoft.Agents.AI/Evaluation/AgentEvaluationExtensions.cs b/dotnet/src/Microsoft.Agents.AI/Evaluation/AgentEvaluationExtensions.cs index cfb179ab98..9d0466c5c3 100644 --- a/dotnet/src/Microsoft.Agents.AI/Evaluation/AgentEvaluationExtensions.cs +++ b/dotnet/src/Microsoft.Agents.AI/Evaluation/AgentEvaluationExtensions.cs @@ -1,4 +1,4 @@ -// Copyright (c) Microsoft. All rights reserved. +// Copyright (c) Microsoft. All rights reserved. using System; using System.Collections.Generic; @@ -77,6 +77,10 @@ public static async Task EvaluateAsync( /// Use , , /// or a custom implementation. /// + /// + /// Number of times to run each query (default 1). When greater than 1, each query is invoked + /// independently N times to measure consistency. + /// /// Cancellation token. /// Evaluation results. public static async Task EvaluateAsync( @@ -113,6 +117,10 @@ public static async Task EvaluateAsync( /// Use , , /// or a custom implementation. /// + /// + /// Number of times to run each query (default 1). 
When greater than 1, each query is invoked + /// independently N times to measure consistency. + /// /// Cancellation token. /// One result per evaluator. public static async Task> EvaluateAsync( @@ -164,7 +172,7 @@ public static async Task EvaluateAsync( IEnumerable>? expectedToolCalls = null, CancellationToken cancellationToken = default) { - var items = BuildItemsFromResponses(agent, responses, queries, expected, expectedToolCalls); + var items = BuildItemsFromResponses(agent, responses, queries, expectedOutput, expectedToolCalls); return await evaluator.EvaluateAsync(items, evalName, cancellationToken).ConfigureAwait(false); } diff --git a/dotnet/src/Microsoft.Agents.AI/Evaluation/EvalItem.cs b/dotnet/src/Microsoft.Agents.AI/Evaluation/EvalItem.cs index 31822ee172..93e860ae65 100644 --- a/dotnet/src/Microsoft.Agents.AI/Evaluation/EvalItem.cs +++ b/dotnet/src/Microsoft.Agents.AI/Evaluation/EvalItem.cs @@ -1,6 +1,5 @@ // Copyright (c) Microsoft. All rights reserved. -using System; using System.Collections.Generic; using System.Linq; using Microsoft.Extensions.AI; diff --git a/dotnet/src/Microsoft.Agents.AI/Microsoft.Agents.AI.csproj b/dotnet/src/Microsoft.Agents.AI/Microsoft.Agents.AI.csproj index 1b08d064ca..a111ce8c2d 100644 --- a/dotnet/src/Microsoft.Agents.AI/Microsoft.Agents.AI.csproj +++ b/dotnet/src/Microsoft.Agents.AI/Microsoft.Agents.AI.csproj @@ -23,7 +23,6 @@ - @@ -32,6 +31,14 @@ + + + + + + + + Microsoft Agent Framework diff --git a/dotnet/tests/Microsoft.Agents.AI.UnitTests/EvaluationTests.cs b/dotnet/tests/Microsoft.Agents.AI.UnitTests/EvaluationTests.cs index e5410bfb3a..603bf4d9ec 100644 --- a/dotnet/tests/Microsoft.Agents.AI.UnitTests/EvaluationTests.cs +++ b/dotnet/tests/Microsoft.Agents.AI.UnitTests/EvaluationTests.cs @@ -891,12 +891,11 @@ public void EvalItem_ExpectedToolCalls_DefaultNull() public void EvalItem_ExpectedToolCalls_CanBeSet() { var item = CreateItem(); - var calls = new List + item.ExpectedToolCalls = new List { 
new("get_weather", new Dictionary { ["location"] = "NYC" }), new("book_flight"), }; - item.ExpectedToolCalls = calls; Assert.NotNull(item.ExpectedToolCalls); Assert.Equal(2, item.ExpectedToolCalls.Count); @@ -905,7 +904,7 @@ public void EvalItem_ExpectedToolCalls_CanBeSet() } [Fact] - public async Task LocalEvaluator_PopulatesInputItems_ForAuditing() + public async Task LocalEvaluator_PopulatesInputItems_ForAuditingAsync() { // Arrange var check = FunctionEvaluator.Create("is_sunny", diff --git a/dotnet/tests/Microsoft.Agents.AI.UnitTests/Microsoft.Agents.AI.UnitTests.csproj b/dotnet/tests/Microsoft.Agents.AI.UnitTests/Microsoft.Agents.AI.UnitTests.csproj index ffa4417f34..a60c27a1c0 100644 --- a/dotnet/tests/Microsoft.Agents.AI.UnitTests/Microsoft.Agents.AI.UnitTests.csproj +++ b/dotnet/tests/Microsoft.Agents.AI.UnitTests/Microsoft.Agents.AI.UnitTests.csproj @@ -13,6 +13,11 @@ + + + + + From 307f7932ae66d947ba484cb5f197de6f0eb0185d Mon Sep 17 00:00:00 2001 From: alliscode Date: Thu, 26 Mar 2026 14:19:41 -0700 Subject: [PATCH 3/4] Address PR review comments: code fixes and comprehensive test coverage Code fixes: - Deduplicate ContentHarmEvaluator in BuildEvaluators (all safety names share one instance) - Throw ArgumentException on unknown evaluator names instead of silently ignoring - BuildEvalItem no longer mutates caller's messages list - AllPassed checks both SubResults and _items when SubResults is populated - Null guard for agent in sample finally blocks - Fix README type reference (Evaluators -> FoundryEvals) Test coverage: - BuildItemsFromResponses validation (mismatched queries/responses/expectedOutput/expectedToolCalls) - BuildEvaluators: quality names, safety deduplication, unknown name throws, default selection - AllPassed: empty items, SubResults with overall failure - BuildEvalItem: property correctness, input list not mutated - ExtractAgentData: empty events, matched pairs, unmatched invocations, completions without invocations, multiple agents, 
duplicate executor IDs, multiple rounds, null data, splitter propagation Infrastructure: - Made BuildItemsFromResponses, ExtractAgentData, BuildEvaluators internal for testability - Added InternalsVisibleTo for AI.UnitTests in AzureAI project - Added conditional AzureAI project reference in AI.UnitTests (net8.0+ only) - Added conditional compile exclusion for WorkflowEvaluationTests.cs on net472 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../Program.cs | 12 +- .../Program.cs | 10 +- .../README.md | 2 +- .../Evaluation/FoundryEvals.cs | 28 ++- .../Microsoft.Agents.AI.AzureAI.csproj | 4 + .../WorkflowEvaluationExtensions.cs | 2 +- .../Evaluation/AgentEvaluationExtensions.cs | 11 +- .../Evaluation/AgentEvaluationResults.cs | 3 +- .../EvaluationTests.cs | 178 ++++++++++++++++++ .../Microsoft.Agents.AI.UnitTests.csproj | 5 + ...osoft.Agents.AI.Workflows.UnitTests.csproj | 5 + .../WorkflowEvaluationTests.cs | 156 +++++++++++++++ 12 files changed, 394 insertions(+), 22 deletions(-) create mode 100644 dotnet/tests/Microsoft.Agents.AI.Workflows.UnitTests/WorkflowEvaluationTests.cs diff --git a/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step02_SelfReflection/Program.cs b/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step02_SelfReflection/Program.cs index aee7473f60..9f7ad4be3a 100644 --- a/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step02_SelfReflection/Program.cs +++ b/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step02_SelfReflection/Program.cs @@ -53,7 +53,8 @@ originalChatConfiguration: new ChatConfiguration(chatClient)); // Create a test agent -AIAgent agent = await aiProjectClient.CreateAIAgentAsync( +AIAgent? agent = null; +agent = await aiProjectClient.CreateAIAgentAsync( name: "KnowledgeAgent", model: deploymentName, instructions: "You are a helpful assistant. Answer questions accurately based on the provided context."); @@ -91,9 +92,12 @@ 7. 
Enterprise-grade compliance and governance features finally { // Cleanup - await aiProjectClient.Agents.DeleteAgentAsync(agent.Name); - Console.WriteLine(); - Console.WriteLine("Cleanup: Agent deleted."); + if (agent is not null) + { + await aiProjectClient.Agents.DeleteAgentAsync(agent.Name); + Console.WriteLine(); + Console.WriteLine("Cleanup: Agent deleted."); + } } // ============================================================================ diff --git a/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/Program.cs b/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/Program.cs index 9f7c6e3e95..4f5ea0c706 100644 --- a/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/Program.cs +++ b/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/Program.cs @@ -51,7 +51,8 @@ originalChatConfiguration: new ChatConfiguration(chatClient)); // Create test agent -AIAgent agent = await aiProjectClient.CreateAIAgentAsync( +AIAgent? agent = null; +agent = await aiProjectClient.CreateAIAgentAsync( name: "WeatherAgent", model: deploymentName, instructions: "You are a helpful weather assistant. 
Answer questions about weather accurately and concisely."); @@ -280,8 +281,11 @@ finally { // Cleanup - await aiProjectClient.Agents.DeleteAgentAsync(agent.Name); - Console.WriteLine("Cleanup: Agent deleted."); + if (agent is not null) + { + await aiProjectClient.Agents.DeleteAgentAsync(agent.Name); + Console.WriteLine("Cleanup: Agent deleted."); + } } // ============================================================================ diff --git a/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/README.md b/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/README.md index d7b598a771..28eab9dd36 100644 --- a/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/README.md +++ b/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/README.md @@ -35,7 +35,7 @@ var local = new LocalEvaluator(check, keyword, toolCheck); var results = await agent.EvaluateAsync(queries, new RelevanceEvaluator(), chatConfig); // Foundry evaluator uses Azure AI Foundry cloud evaluation -var foundry = new FoundryEvals(chatConfig, Evaluators.Relevance, Evaluators.Coherence); +var foundry = new FoundryEvals(chatConfig, FoundryEvals.Relevance, FoundryEvals.Coherence); // Evaluate an agent AgentEvaluationResults localResults = await agent.EvaluateAsync(queries, local); diff --git a/dotnet/src/Microsoft.Agents.AI.AzureAI/Evaluation/FoundryEvals.cs b/dotnet/src/Microsoft.Agents.AI.AzureAI/Evaluation/FoundryEvals.cs index a68fc43cd8..a731af1099 100644 --- a/dotnet/src/Microsoft.Agents.AI.AzureAI/Evaluation/FoundryEvals.cs +++ b/dotnet/src/Microsoft.Agents.AI.AzureAI/Evaluation/FoundryEvals.cs @@ -189,29 +189,43 @@ public async Task EvaluateAsync( // Internal helpers // ----------------------------------------------------------------------- - private static List BuildEvaluators(string[] names) + internal static List BuildEvaluators(string[] names) { var evaluators = new List(); 
+ bool hasSafetyEvaluator = false; foreach (var name in names) { - var evaluator = name switch + IEvaluator? evaluator = name switch { Relevance => new RelevanceEvaluator(), Coherence => new CoherenceEvaluator(), Groundedness => new GroundednessEvaluator(), - Fluency => (IEvaluator)new FluencyEvaluator(), + Fluency => new FluencyEvaluator(), - // Safety evaluators + // ContentHarmEvaluator covers all harm categories in one call — deduplicate Violence or Sexual or SelfHarm or - HateUnfairness => new ContentHarmEvaluator(), + HateUnfairness when !hasSafetyEvaluator => new ContentHarmEvaluator(), - // Agent evaluators not yet available in MEAI — log warning and skip - _ => null, + Violence or + Sexual or + SelfHarm or + HateUnfairness => null, + + _ => throw new ArgumentException( + $"Evaluator '{name}' is not supported by the .NET FoundryEvals adapter. " + + $"Supported: {Relevance}, {Coherence}, {Groundedness}, {Fluency}, " + + $"{Violence}, {Sexual}, {SelfHarm}, {HateUnfairness}.", + nameof(names)), }; + if (evaluator is ContentHarmEvaluator) + { + hasSafetyEvaluator = true; + } + if (evaluator is not null) { evaluators.Add(evaluator); diff --git a/dotnet/src/Microsoft.Agents.AI.AzureAI/Microsoft.Agents.AI.AzureAI.csproj b/dotnet/src/Microsoft.Agents.AI.AzureAI/Microsoft.Agents.AI.AzureAI.csproj index 41225c24ef..fce34b7201 100644 --- a/dotnet/src/Microsoft.Agents.AI.AzureAI/Microsoft.Agents.AI.AzureAI.csproj +++ b/dotnet/src/Microsoft.Agents.AI.AzureAI/Microsoft.Agents.AI.AzureAI.csproj @@ -30,6 +30,10 @@ + + + + diff --git a/dotnet/src/Microsoft.Agents.AI.Workflows/Evaluation/WorkflowEvaluationExtensions.cs b/dotnet/src/Microsoft.Agents.AI.Workflows/Evaluation/WorkflowEvaluationExtensions.cs index d404b182bc..badf6ff642 100644 --- a/dotnet/src/Microsoft.Agents.AI.Workflows/Evaluation/WorkflowEvaluationExtensions.cs +++ b/dotnet/src/Microsoft.Agents.AI.Workflows/Evaluation/WorkflowEvaluationExtensions.cs @@ -90,7 +90,7 @@ public static async Task EvaluateAsync( 
return overallResult; } - private static Dictionary> ExtractAgentData( + internal static Dictionary> ExtractAgentData( List events, IConversationSplitter? splitter) { diff --git a/dotnet/src/Microsoft.Agents.AI/Evaluation/AgentEvaluationExtensions.cs b/dotnet/src/Microsoft.Agents.AI/Evaluation/AgentEvaluationExtensions.cs index 9d0466c5c3..31904218ad 100644 --- a/dotnet/src/Microsoft.Agents.AI/Evaluation/AgentEvaluationExtensions.cs +++ b/dotnet/src/Microsoft.Agents.AI/Evaluation/AgentEvaluationExtensions.cs @@ -208,7 +208,7 @@ public static async Task EvaluateAsync( return await agent.EvaluateAsync(responses, queries, wrapped, evalName, expectedOutput, expectedToolCalls, cancellationToken).ConfigureAwait(false); } - private static List BuildItemsFromResponses( + internal static List BuildItemsFromResponses( AIAgent agent, IEnumerable responses, IEnumerable queries, @@ -336,16 +336,17 @@ internal static EvalItem BuildEvalItem( List messages, AIAgent agent) { - // Add response messages to conversation + // Build conversation from existing messages plus any new response messages + var conversation = new List(messages); foreach (var msg in response.Messages) { - if (!messages.Contains(msg)) + if (!conversation.Contains(msg)) { - messages.Add(msg); + conversation.Add(msg); } } - return new EvalItem(query, response.Text, messages) + return new EvalItem(query, response.Text, conversation) { RawResponse = new ChatResponse(response.Messages.LastOrDefault() ?? 
new ChatMessage(ChatRole.Assistant, response.Text)), diff --git a/dotnet/src/Microsoft.Agents.AI/Evaluation/AgentEvaluationResults.cs b/dotnet/src/Microsoft.Agents.AI/Evaluation/AgentEvaluationResults.cs index 6406760c49..c46bc8046b 100644 --- a/dotnet/src/Microsoft.Agents.AI/Evaluation/AgentEvaluationResults.cs +++ b/dotnet/src/Microsoft.Agents.AI/Evaluation/AgentEvaluationResults.cs @@ -62,7 +62,8 @@ public bool AllPassed { if (this.SubResults is not null) { - return this.SubResults.Values.All(s => s.AllPassed); + return this.SubResults.Values.All(s => s.AllPassed) + && (this.Total == 0 || this.Failed == 0); } return this.Total > 0 && this.Failed == 0; diff --git a/dotnet/tests/Microsoft.Agents.AI.UnitTests/EvaluationTests.cs b/dotnet/tests/Microsoft.Agents.AI.UnitTests/EvaluationTests.cs index 603bf4d9ec..00c3519f3f 100644 --- a/dotnet/tests/Microsoft.Agents.AI.UnitTests/EvaluationTests.cs +++ b/dotnet/tests/Microsoft.Agents.AI.UnitTests/EvaluationTests.cs @@ -931,4 +931,182 @@ public async Task LocalEvaluator_PopulatesInputItems_ForAuditingAsync() // Results and InputItems are positionally correlated Assert.Equal(results.Items.Count, results.InputItems.Count); } + + // --------------------------------------------------------------- + // AgentEvaluationResults tests + // --------------------------------------------------------------- + + [Fact] + public void AllPassed_EmptyItems_NoSubResults_ReturnsFalseAsync() + { + var results = new AgentEvaluationResults("test", Array.Empty()); + Assert.False(results.AllPassed); + Assert.Equal(0, results.Total); + } + + [Fact] + public void AllPassed_SubResultsAllPass_OverallFails_ReturnsFalseAsync() + { + // Overall has a failing item + var failMetric = new BooleanMetric("check", false) + { + Interpretation = new EvaluationMetricInterpretation + { + Rating = EvaluationRating.Unacceptable, + Failed = true, + }, + }; + var failResult = new EvaluationResult(); + failResult.Metrics["check"] = failMetric; + + var overall = new 
AgentEvaluationResults("test", new[] { failResult }); + + // Sub-results all pass + var passMetric = new BooleanMetric("check", true) + { + Interpretation = new EvaluationMetricInterpretation + { + Rating = EvaluationRating.Good, + Failed = false, + }, + }; + var passResult = new EvaluationResult(); + passResult.Metrics["check"] = passMetric; + + overall.SubResults = new Dictionary + { + ["agent1"] = new AgentEvaluationResults("sub", new[] { passResult }), + }; + + // Overall has a failing item, so AllPassed should be false + Assert.False(overall.AllPassed); + } + + // --------------------------------------------------------------- + // BuildEvalItem tests + // --------------------------------------------------------------- + + [Fact] + public void BuildEvalItem_SetsPropertiesCorrectly() + { + var userMsg = new ChatMessage(ChatRole.User, "test query"); + var assistantMsg = new ChatMessage(ChatRole.Assistant, "response"); + var inputMessages = new List { userMsg }; + var response = new AgentResponse(assistantMsg); + + var item = AgentEvaluationExtensions.BuildEvalItem("test query", response, inputMessages, null!); + + Assert.Equal("test query", item.Query); + Assert.NotNull(item.RawResponse); + } + + [Fact] + public void BuildEvalItem_DoesNotMutateInputMessages() + { + // Arrange + var userMsg = new ChatMessage(ChatRole.User, "hello"); + var assistantMsg = new ChatMessage(ChatRole.Assistant, "world"); + var inputMessages = new List { userMsg }; + var response = new AgentResponse(assistantMsg); + + // Act + var item = AgentEvaluationExtensions.BuildEvalItem("hello", response, inputMessages, null!); + + // Assert — input list is not mutated + Assert.Single(inputMessages); + Assert.Equal(userMsg, inputMessages[0]); + + // But the EvalItem's conversation includes the response message + Assert.Equal(2, item.Conversation.Count); + } + + // --------------------------------------------------------------- + // BuildItemsFromResponses validation tests + 
// --------------------------------------------------------------- + + [Fact] + public void BuildItemsFromResponses_MismatchedQueryAndResponseCount_Throws() + { + var queries = new[] { "q1", "q2" }; + var responses = new[] { new AgentResponse(new ChatMessage(ChatRole.Assistant, "a1")) }; + + var ex = Assert.Throws( + () => AgentEvaluationExtensions.BuildItemsFromResponses(null!, responses, queries, null, null)); + Assert.Contains("queries", ex.Message); + Assert.Contains("responses", ex.Message); + } + + [Fact] + public void BuildItemsFromResponses_MismatchedExpectedOutput_Throws() + { + var queries = new[] { "q1" }; + var responses = new[] { new AgentResponse(new ChatMessage(ChatRole.Assistant, "a1")) }; + var expectedOutput = new[] { "e1", "e2" }; + + var ex = Assert.Throws( + () => AgentEvaluationExtensions.BuildItemsFromResponses(null!, responses, queries, expectedOutput, null)); + Assert.Contains("expectedOutput", ex.Message); + } + + [Fact] + public void BuildItemsFromResponses_MismatchedExpectedToolCalls_Throws() + { + var queries = new[] { "q1" }; + var responses = new[] { new AgentResponse(new ChatMessage(ChatRole.Assistant, "a1")) }; + var expectedToolCalls = new[] { new[] { new ExpectedToolCall("t1") }, new[] { new ExpectedToolCall("t2") } }; + + var ex = Assert.Throws( + () => AgentEvaluationExtensions.BuildItemsFromResponses( + null!, responses, queries, null, expectedToolCalls)); + Assert.Contains("expectedToolCalls", ex.Message); + } + + // --------------------------------------------------------------- + // FoundryEvals.BuildEvaluators tests + // --------------------------------------------------------------- + + [Fact] + public void BuildEvaluators_QualityNames_ReturnsDistinctEvaluators() + { + var evaluators = AzureAI.FoundryEvals.BuildEvaluators( + new[] { AzureAI.FoundryEvals.Relevance, AzureAI.FoundryEvals.Coherence }); + + Assert.Equal(2, evaluators.Count); + } + + [Fact] + public void 
BuildEvaluators_MultipleSafetyNames_SingleContentHarmEvaluator() + { + var evaluators = AzureAI.FoundryEvals.BuildEvaluators( + new[] + { + AzureAI.FoundryEvals.Violence, + AzureAI.FoundryEvals.Sexual, + AzureAI.FoundryEvals.SelfHarm, + AzureAI.FoundryEvals.HateUnfairness, + }); + + // All four safety names produce exactly one ContentHarmEvaluator + Assert.Single(evaluators); + } + + [Fact] + public void BuildEvaluators_UnknownName_ThrowsArgumentException() + { + var names = new[] { "gobblygook" }; + var ex = Assert.Throws( + () => AzureAI.FoundryEvals.BuildEvaluators(names)); + Assert.Contains("gobblygook", ex.Message); + Assert.Contains("not supported", ex.Message, StringComparison.OrdinalIgnoreCase); + } + + [Fact] + public void BuildEvaluators_DefaultSelection_ReturnsRelevanceAndCoherence() + { + // Default evaluator names when constructor receives empty array + var defaults = new[] { AzureAI.FoundryEvals.Relevance, AzureAI.FoundryEvals.Coherence }; + var evaluators = AzureAI.FoundryEvals.BuildEvaluators(defaults); + + Assert.Equal(2, evaluators.Count); + } } diff --git a/dotnet/tests/Microsoft.Agents.AI.UnitTests/Microsoft.Agents.AI.UnitTests.csproj b/dotnet/tests/Microsoft.Agents.AI.UnitTests/Microsoft.Agents.AI.UnitTests.csproj index a60c27a1c0..8e1dba18bd 100644 --- a/dotnet/tests/Microsoft.Agents.AI.UnitTests/Microsoft.Agents.AI.UnitTests.csproj +++ b/dotnet/tests/Microsoft.Agents.AI.UnitTests/Microsoft.Agents.AI.UnitTests.csproj @@ -13,6 +13,11 @@ + + + + + diff --git a/dotnet/tests/Microsoft.Agents.AI.Workflows.UnitTests/Microsoft.Agents.AI.Workflows.UnitTests.csproj b/dotnet/tests/Microsoft.Agents.AI.Workflows.UnitTests/Microsoft.Agents.AI.Workflows.UnitTests.csproj index 58979a4f1b..6adedab6c3 100644 --- a/dotnet/tests/Microsoft.Agents.AI.Workflows.UnitTests/Microsoft.Agents.AI.Workflows.UnitTests.csproj +++ b/dotnet/tests/Microsoft.Agents.AI.Workflows.UnitTests/Microsoft.Agents.AI.Workflows.UnitTests.csproj @@ -4,6 +4,11 @@ $(NoWarn);MEAI001 + + + + + 
diff --git a/dotnet/tests/Microsoft.Agents.AI.Workflows.UnitTests/WorkflowEvaluationTests.cs b/dotnet/tests/Microsoft.Agents.AI.Workflows.UnitTests/WorkflowEvaluationTests.cs new file mode 100644 index 0000000000..1ab7e71a82 --- /dev/null +++ b/dotnet/tests/Microsoft.Agents.AI.Workflows.UnitTests/WorkflowEvaluationTests.cs @@ -0,0 +1,156 @@ +// Copyright (c) Microsoft. All rights reserved. + +using System.Collections.Generic; + +namespace Microsoft.Agents.AI.Workflows.UnitTests; + +/// +/// Tests for . +/// +public sealed class WorkflowEvaluationTests +{ + [Fact] + public void ExtractAgentData_EmptyEvents_ReturnsEmpty() + { + var result = WorkflowEvaluationExtensions.ExtractAgentData(new List(), splitter: null); + + Assert.Empty(result); + } + + [Fact] + public void ExtractAgentData_MatchedPair_ReturnsItem() + { + var events = new List + { + new ExecutorInvokedEvent("agent-1", "What is the weather?"), + new ExecutorCompletedEvent("agent-1", "It's sunny."), + }; + + var result = WorkflowEvaluationExtensions.ExtractAgentData(events, splitter: null); + + Assert.Single(result); + Assert.True(result.ContainsKey("agent-1")); + Assert.Single(result["agent-1"]); + Assert.Equal("What is the weather?", result["agent-1"][0].Query); + Assert.Equal("It's sunny.", result["agent-1"][0].Response); + Assert.Equal(2, result["agent-1"][0].Conversation.Count); + } + + [Fact] + public void ExtractAgentData_UnmatchedInvocation_NotIncluded() + { + // An invocation without a matching completion should not appear in results + var events = new List + { + new ExecutorInvokedEvent("agent-1", "Hello"), + }; + + var result = WorkflowEvaluationExtensions.ExtractAgentData(events, splitter: null); + + Assert.Empty(result); + } + + [Fact] + public void ExtractAgentData_CompletionWithoutInvocation_NotIncluded() + { + // A completion without a prior invocation should not appear in results + var events = new List + { + new ExecutorCompletedEvent("agent-1", "Response"), + }; + + var result = 
WorkflowEvaluationExtensions.ExtractAgentData(events, splitter: null); + + Assert.Empty(result); + } + + [Fact] + public void ExtractAgentData_MultipleAgents_SeparatedByExecutorId() + { + var events = new List + { + new ExecutorInvokedEvent("agent-1", "Q1"), + new ExecutorInvokedEvent("agent-2", "Q2"), + new ExecutorCompletedEvent("agent-1", "A1"), + new ExecutorCompletedEvent("agent-2", "A2"), + }; + + var result = WorkflowEvaluationExtensions.ExtractAgentData(events, splitter: null); + + Assert.Equal(2, result.Count); + Assert.Equal("Q1", result["agent-1"][0].Query); + Assert.Equal("A1", result["agent-1"][0].Response); + Assert.Equal("Q2", result["agent-2"][0].Query); + Assert.Equal("A2", result["agent-2"][0].Response); + } + + [Fact] + public void ExtractAgentData_DuplicateExecutorId_LastInvocationUsed() + { + // If the same executor is invoked twice before completing, + // the second invocation overwrites the first + var events = new List + { + new ExecutorInvokedEvent("agent-1", "First question"), + new ExecutorInvokedEvent("agent-1", "Second question"), + new ExecutorCompletedEvent("agent-1", "Answer"), + }; + + var result = WorkflowEvaluationExtensions.ExtractAgentData(events, splitter: null); + + Assert.Single(result); + Assert.Single(result["agent-1"]); + Assert.Equal("Second question", result["agent-1"][0].Query); + } + + [Fact] + public void ExtractAgentData_MultipleRoundsForSameExecutor_AllCaptured() + { + // Same executor invoked→completed twice (sequential rounds) + var events = new List + { + new ExecutorInvokedEvent("agent-1", "Q1"), + new ExecutorCompletedEvent("agent-1", "A1"), + new ExecutorInvokedEvent("agent-1", "Q2"), + new ExecutorCompletedEvent("agent-1", "A2"), + }; + + var result = WorkflowEvaluationExtensions.ExtractAgentData(events, splitter: null); + + Assert.Single(result); // one executor + Assert.Equal(2, result["agent-1"].Count); // two items + Assert.Equal("Q1", result["agent-1"][0].Query); + Assert.Equal("Q2", 
result["agent-1"][1].Query); + } + + [Fact] + public void ExtractAgentData_NullData_UsesEmptyString() + { + var events = new List + { + new ExecutorInvokedEvent("agent-1", null!), + new ExecutorCompletedEvent("agent-1", null), + }; + + var result = WorkflowEvaluationExtensions.ExtractAgentData(events, splitter: null); + + Assert.Single(result); + Assert.Equal(string.Empty, result["agent-1"][0].Query); + Assert.Equal(string.Empty, result["agent-1"][0].Response); + } + + [Fact] + public void ExtractAgentData_WithSplitter_SetOnItems() + { + var splitter = ConversationSplitters.LastTurn; + var events = new List + { + new ExecutorInvokedEvent("agent-1", "Q"), + new ExecutorCompletedEvent("agent-1", "A"), + }; + + var result = WorkflowEvaluationExtensions.ExtractAgentData(events, splitter); + + Assert.Equal(splitter, result["agent-1"][0].Splitter); + } +} From bb89fd132a3bb3e7bfe837c68c2d7110a70c47b8 Mon Sep 17 00:00:00 2001 From: alliscode Date: Thu, 26 Mar 2026 14:28:38 -0700 Subject: [PATCH 4/4] Fix MeaiEvaluatorAdapter: split conversation before evaluating Use item.Split() to separate query messages from the response, matching what FoundryEvals does. Previously the full conversation (including assistant turns) was passed as 'messages' alongside chatResponse, feeding duplicate assistant context to the evaluator and corrupting quality scores. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../src/Microsoft.Agents.AI/Evaluation/MeaiEvaluatorAdapter.cs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dotnet/src/Microsoft.Agents.AI/Evaluation/MeaiEvaluatorAdapter.cs b/dotnet/src/Microsoft.Agents.AI/Evaluation/MeaiEvaluatorAdapter.cs index df58ebcd7c..e2a6ea67e4 100644 --- a/dotnet/src/Microsoft.Agents.AI/Evaluation/MeaiEvaluatorAdapter.cs +++ b/dotnet/src/Microsoft.Agents.AI/Evaluation/MeaiEvaluatorAdapter.cs @@ -44,7 +44,8 @@ public async Task EvaluateAsync( { cancellationToken.ThrowIfCancellationRequested(); - var messages = item.Conversation.ToList(); + var (queryMessages, _) = item.Split(); + var messages = queryMessages.ToList(); var chatResponse = item.RawResponse ?? new ChatResponse(new ChatMessage(ChatRole.Assistant, item.Response));