diff --git a/dotnet/agent-framework-dotnet.slnx b/dotnet/agent-framework-dotnet.slnx index a4ffe13958..1c47bfe2b5 100644 --- a/dotnet/agent-framework-dotnet.slnx +++ b/dotnet/agent-framework-dotnet.slnx @@ -147,6 +147,7 @@ + diff --git a/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step02_SelfReflection/FoundryAgents_Evaluations_Step02_SelfReflection.csproj b/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step02_SelfReflection/FoundryAgents_Evaluations_Step02_SelfReflection.csproj index 646cd75532..8b6a7d5001 100644 --- a/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step02_SelfReflection/FoundryAgents_Evaluations_Step02_SelfReflection.csproj +++ b/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step02_SelfReflection/FoundryAgents_Evaluations_Step02_SelfReflection.csproj @@ -9,7 +9,6 @@ - diff --git a/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step02_SelfReflection/Program.cs b/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step02_SelfReflection/Program.cs index 8f8c9fa4ee..9f7ad4be3a 100644 --- a/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step02_SelfReflection/Program.cs +++ b/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step02_SelfReflection/Program.cs @@ -12,7 +12,6 @@ // For more details, see: // https://learn.microsoft.com/dotnet/ai/evaluation/libraries -using Azure.AI.OpenAI; using Azure.AI.Projects; using Azure.Identity; using Microsoft.Agents.AI; @@ -24,26 +23,25 @@ using ChatMessage = Microsoft.Extensions.AI.ChatMessage; using ChatRole = Microsoft.Extensions.AI.ChatRole; -string endpoint = Environment.GetEnvironmentVariable("AZURE_AI_PROJECT_ENDPOINT") ?? throw new InvalidOperationException("AZURE_AI_PROJECT_ENDPOINT is not set."); -string deploymentName = Environment.GetEnvironmentVariable("AZURE_AI_MODEL_DEPLOYMENT_NAME") ?? 
"gpt-4o-mini"; -string openAiEndpoint = Environment.GetEnvironmentVariable("AZURE_OPENAI_ENDPOINT") ?? throw new InvalidOperationException("AZURE_OPENAI_ENDPOINT is not set."); -string evaluatorDeploymentName = Environment.GetEnvironmentVariable("AZURE_OPENAI_DEPLOYMENT_NAME") ?? deploymentName; +string endpoint = Environment.GetEnvironmentVariable("AZURE_FOUNDRY_PROJECT_ENDPOINT") ?? throw new InvalidOperationException("AZURE_FOUNDRY_PROJECT_ENDPOINT is not set."); +string deploymentName = Environment.GetEnvironmentVariable("AZURE_FOUNDRY_PROJECT_DEPLOYMENT_NAME") ?? "gpt-4o-mini"; Console.WriteLine("=" + new string('=', 79)); Console.WriteLine("SELF-REFLECTION EVALUATION SAMPLE"); Console.WriteLine("=" + new string('=', 79)); Console.WriteLine(); -// Initialize Azure credentials and client +// Initialize Azure credentials and client — everything derives from the project endpoint // WARNING: DefaultAzureCredential is convenient for development but requires careful consideration in production. // In production, consider using a specific credential (e.g., ManagedIdentityCredential) to avoid // latency issues, unintended credential probing, and potential security risks from fallback mechanisms. DefaultAzureCredential credential = new(); AIProjectClient aiProjectClient = new(new Uri(endpoint), credential); -// Set up the LLM-based chat client for quality evaluators -IChatClient chatClient = new AzureOpenAIClient(new Uri(openAiEndpoint), credential) - .GetChatClient(evaluatorDeploymentName) +// Get a chat client for LLM-based evaluators from the project client +IChatClient chatClient = aiProjectClient + .GetProjectOpenAIClient() + .GetChatClient(deploymentName) .AsIChatClient(); // Configure evaluation: quality evaluators use the LLM, safety evaluators use Azure AI Foundry @@ -55,7 +53,8 @@ originalChatConfiguration: new ChatConfiguration(chatClient)); // Create a test agent -AIAgent agent = await aiProjectClient.CreateAIAgentAsync( +AIAgent? 
agent = null; +agent = await aiProjectClient.CreateAIAgentAsync( name: "KnowledgeAgent", model: deploymentName, instructions: "You are a helpful assistant. Answer questions accurately based on the provided context."); @@ -93,9 +92,12 @@ 7. Enterprise-grade compliance and governance features finally { // Cleanup - await aiProjectClient.Agents.DeleteAgentAsync(agent.Name); - Console.WriteLine(); - Console.WriteLine("Cleanup: Agent deleted."); + if (agent is not null) + { + await aiProjectClient.Agents.DeleteAgentAsync(agent.Name); + Console.WriteLine(); + Console.WriteLine("Cleanup: Agent deleted."); + } } // ============================================================================ diff --git a/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/FoundryAgents_Evaluations_Step03_AllPatterns.csproj b/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/FoundryAgents_Evaluations_Step03_AllPatterns.csproj new file mode 100644 index 0000000000..8b6a7d5001 --- /dev/null +++ b/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/FoundryAgents_Evaluations_Step03_AllPatterns.csproj @@ -0,0 +1,24 @@ + + + + Exe + net10.0 + + enable + enable + + + + + + + + + + + + + + + + diff --git a/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/Program.cs b/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/Program.cs new file mode 100644 index 0000000000..4f5ea0c706 --- /dev/null +++ b/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/Program.cs @@ -0,0 +1,339 @@ +// Copyright (c) Microsoft. All rights reserved. + +// This sample demonstrates all evaluation patterns available in Agent Framework for .NET. +// It covers: +// 1. Function evaluators — custom checks using lambdas +// 2. Built-in checks — keyword and tool-called validation +// 3. 
MEAI evaluators — LLM-based quality scoring (Relevance, Coherence)
+// 4. Foundry evaluators — cloud-based evaluation with Azure AI Foundry
+// 5. Mixed evaluators — combining local checks with cloud evaluation
+// 6. Pre-existing response evaluation — evaluate responses without re-running the agent
+// 7. Conversation split strategies — LastTurn, Full, PerTurn, and call-site override
+//
+// Mirrors the Python sample: evaluate_all_patterns_sample.py
+
+using Azure.AI.Projects;
+using Azure.Identity;
+using Microsoft.Agents.AI;
+using Microsoft.Extensions.AI;
+using Microsoft.Extensions.AI.Evaluation;
+using Microsoft.Extensions.AI.Evaluation.Quality;
+using Microsoft.Extensions.AI.Evaluation.Safety;
+
+using ChatMessage = Microsoft.Extensions.AI.ChatMessage;
+using ChatRole = Microsoft.Extensions.AI.ChatRole;
+using FoundryEvals = Microsoft.Agents.AI.AzureAI.FoundryEvals;
+
+string endpoint = Environment.GetEnvironmentVariable("AZURE_FOUNDRY_PROJECT_ENDPOINT")
+    ?? throw new InvalidOperationException("AZURE_FOUNDRY_PROJECT_ENDPOINT is not set.");
+string deploymentName = Environment.GetEnvironmentVariable("AZURE_FOUNDRY_PROJECT_DEPLOYMENT_NAME") ?? "gpt-4o-mini";
+
+Console.WriteLine("=" + new string('=', 79));
+Console.WriteLine("AGENT FRAMEWORK EVALUATION — ALL PATTERNS");
+Console.WriteLine("=" + new string('=', 79));
+Console.WriteLine();
+
+// Initialize Azure credentials and clients — everything derives from the project endpoint
+// WARNING: DefaultAzureCredential is convenient for development but requires careful consideration in production.
+// In production, consider using a specific credential (e.g., ManagedIdentityCredential) to avoid
+// latency issues, unintended credential probing, and potential security risks from fallback mechanisms.
+DefaultAzureCredential credential = new();
+AIProjectClient aiProjectClient = new(new Uri(endpoint), credential);
+
+// Get a chat client for LLM-based evaluators from the project client
+IChatClient chatClient = aiProjectClient
+    .GetProjectOpenAIClient()
+    .GetChatClient(deploymentName)
+    .AsIChatClient();
+
+// Safety evaluators talk to the Foundry project endpoint; quality evaluators use the chat client.
+ContentSafetyServiceConfiguration safetyConfig = new(
+    credential: credential,
+    endpoint: new Uri(endpoint));
+
+ChatConfiguration chatConfiguration = safetyConfig.ToChatConfiguration(
+    originalChatConfiguration: new ChatConfiguration(chatClient));
+
+// Create test agent
+AIAgent? agent = null;
+agent = await aiProjectClient.CreateAIAgentAsync(
+    name: "WeatherAgent",
+    model: deploymentName,
+    instructions: "You are a helpful weather assistant. Answer questions about weather accurately and concisely.");
+
+Console.WriteLine($"Created agent: {agent.Name}");
+Console.WriteLine();
+
+string[] queries = ["What's the weather in Seattle?", "Is it going to rain in New York today?"];
+
+try
+{
+    // ================================================================
+    // Section 1: Function Evaluators
+    // ================================================================
+    Console.WriteLine("SECTION 1: Function Evaluators");
+    Console.WriteLine(new string('-', 60));
+
+    // Lambdas may take the raw response text or the whole EvalItem.
+    var functionEvaluator = new LocalEvaluator(
+        FunctionEvaluator.Create("is_concise",
+            (string response) => response.Split(' ').Length < 500),
+        FunctionEvaluator.Create("has_content",
+            (string response) => response.Length > 10),
+        FunctionEvaluator.Create("mentions_location",
+            (EvalItem item) => item.Response.Contains("Seattle", StringComparison.OrdinalIgnoreCase)
+                || item.Response.Contains("New York", StringComparison.OrdinalIgnoreCase)));
+
+    AgentEvaluationResults functionResults = await agent.EvaluateAsync(
+        queries,
+        functionEvaluator);
+
+    PrintResults("Function Evaluators", functionResults);
+
+    // ================================================================
+    // Section 2: Built-in Checks
+    // ================================================================
+    Console.WriteLine("SECTION 2: Built-in Checks");
+    Console.WriteLine(new string('-', 60));
+
+    var builtinEvaluator = new LocalEvaluator(
+        EvalChecks.KeywordCheck("weather"),
+        EvalChecks.KeywordCheck(caseSensitive: false, "temperature", "forecast"));
+
+    AgentEvaluationResults builtinResults = await agent.EvaluateAsync(
+        queries,
+        builtinEvaluator);
+
+    PrintResults("Built-in Checks", builtinResults);
+
+    // ================================================================
+    // Section 3: MEAI Quality Evaluators
+    // ================================================================
+    Console.WriteLine("SECTION 3: MEAI Quality Evaluators");
+    Console.WriteLine(new string('-', 60));
+
+    // Pass MEAI evaluators directly — no adapter needed
+    AgentEvaluationResults meaiResults = await agent.EvaluateAsync(
+        queries,
+        new CompositeEvaluator(
+            new RelevanceEvaluator(),
+            new CoherenceEvaluator()),
+        chatConfiguration);
+
+    PrintResults("MEAI Quality", meaiResults);
+
+    // Print per-metric details for MEAI results
+    foreach (EvaluationResult itemResult in meaiResults.Items)
+    {
+        foreach (EvaluationMetric metric in itemResult.Metrics.Values)
+        {
+            if (metric is NumericMetric n)
+            {
+                string rating = n.Interpretation?.Rating.ToString() ?? "N/A";
+                Console.WriteLine($" {n.Name,-20} Score: {n.Value:F1}/5 Rating: {rating}");
+            }
+        }
+    }
+
+    Console.WriteLine();
+
+    // ================================================================
+    // Section 4: Foundry Evaluators (Cloud-based)
+    // ================================================================
+    Console.WriteLine("SECTION 4: Foundry Evaluators");
+    Console.WriteLine(new string('-', 60));
+
+    // NOTE(review): FoundryEvals only supplies grounding context when an item carries Context;
+    // confirm the items produced here have it, otherwise Groundedness has nothing to ground against.
+    var foundryEvaluator = new FoundryEvals(
+        chatConfiguration,
+        FoundryEvals.Relevance,
+        FoundryEvals.Coherence,
+        FoundryEvals.Groundedness);
+
+    AgentEvaluationResults foundryResults = await agent.EvaluateAsync(
+        queries,
+        foundryEvaluator);
+
+    PrintResults("Foundry Evaluators", foundryResults);
+
+    // ================================================================
+    // Section 5: Mixed Evaluators (Local + Cloud)
+    // ================================================================
+    Console.WriteLine("SECTION 5: Mixed Evaluators");
+    Console.WriteLine(new string('-', 60));
+
+    // One result set comes back per evaluator passed in.
+    IReadOnlyList<AgentEvaluationResults> mixedResults = await agent.EvaluateAsync(
+        queries,
+        evaluators: new IAgentEvaluator[]
+        {
+            new LocalEvaluator(
+                EvalChecks.KeywordCheck("weather"),
+                FunctionEvaluator.Create("not_empty", (string r) => r.Length > 0)),
+            new FoundryEvals(chatConfiguration, FoundryEvals.Relevance),
+        });
+
+    foreach (AgentEvaluationResults result in mixedResults)
+    {
+        PrintResults($"Mixed - {result.Provider}", result);
+    }
+
+    // ================================================================
+    // Section 6: Evaluate Pre-existing Responses
+    // ================================================================
+    Console.WriteLine("SECTION 6: Evaluate Pre-existing Responses");
+    Console.WriteLine(new string('-', 60));
+
+    // Get responses first
+    var savedQueries = new List<string>();
+    var savedResponses = new List<AgentResponse>();
+    foreach (string query in queries)
+    {
+        AgentResponse response = await agent.RunAsync(
+            new List<ChatMessage> { new(ChatRole.User, query) });
+        savedQueries.Add(query);
+        savedResponses.Add(response);
+    }
+
+    // Evaluate the saved responses without re-running the agent
+    AgentEvaluationResults preExistingResults = await agent.EvaluateAsync(
+        savedResponses,
+        savedQueries,
+        new LocalEvaluator(
+            EvalChecks.KeywordCheck("weather"),
+            FunctionEvaluator.Create("response_quality",
+                (EvalItem item) => new EvalCheckResult(
+                    item.Response.Length > 20,
+                    item.Response.Length > 20
+                        ? "Response is detailed enough"
+                        : "Response is too short",
+                    "response_quality"))));
+
+    PrintResults("Pre-existing Responses", preExistingResults);
+
+    // ================================================================
+    // Section 7: Conversation Split Strategies
+    // ================================================================
+    Console.WriteLine("SECTION 7: Conversation Split Strategies");
+    Console.WriteLine(new string('-', 60));
+
+    // Build a multi-turn conversation manually
+    var multiTurnConversation = new List<ChatMessage>
+    {
+        new(ChatRole.User, "What's the weather in Seattle?"),
+        new(ChatRole.Assistant, "Seattle is 62°F, cloudy with a chance of rain."),
+        new(ChatRole.User, "And Paris?"),
+        new(ChatRole.Assistant, "Paris is 68°F, partly sunny."),
+        new(ChatRole.User, "Compare them."),
+        new(ChatRole.Assistant, "Seattle is cooler at 62°F with rain likely, while Paris is warmer at 68°F and sunnier."),
+    };
+
+    // Strategy 1: LAST_TURN (default) — evaluates the final response
+    var lastTurnItem = new EvalItem(
+        "Compare them.",
+        "Seattle is cooler at 62°F with rain likely, while Paris is warmer at 68°F and sunnier.",
+        multiTurnConversation);
+
+    var (lastQuery, lastResponse) = lastTurnItem.Split(ConversationSplitters.LastTurn);
+    Console.WriteLine($" LastTurn split: {lastQuery.Count} query msgs, {lastResponse.Count} response msgs");
+
+    // Strategy 2: FULL — evaluates the whole conversation trajectory
+    var fullItem = new EvalItem(
+        "What's the weather in Seattle?",
+        "Full conversation trajectory",
+        multiTurnConversation)
+    {
+        Splitter = ConversationSplitters.Full,
+    };
+
+    var (fullQuery, fullResponse) = fullItem.Split();
+    Console.WriteLine($" Full split: {fullQuery.Count} query msgs, {fullResponse.Count} response msgs");
+
+    // Strategy 3: PER_TURN — one eval item per user turn
+    var perTurnItems = EvalItem.PerTurnItems(multiTurnConversation);
+    Console.WriteLine($" PerTurn split: {perTurnItems.Count} items from {multiTurnConversation.Count} messages");
+
+    foreach (var turnItem in perTurnItems)
+    {
+        Console.WriteLine($" Turn: \"{turnItem.Query}\" → {turnItem.Response.Length} chars");
+    }
+
+    // Evaluate per-turn items with a local evaluator
+    var splitEvaluator = new LocalEvaluator(
+        FunctionEvaluator.Create("has_response", (string r) => r.Length > 5));
+
+    AgentEvaluationResults perTurnResults = await splitEvaluator.EvaluateAsync(
+        perTurnItems.ToList());
+
+    PrintResults("Per-Turn Evaluation", perTurnResults);
+
+    // Strategy 4: Call-site override with built-in splitter
+    AgentEvaluationResults fullSplitResults = await agent.EvaluateAsync(
+        queries,
+        new LocalEvaluator(EvalChecks.KeywordCheck("weather")),
+        splitter: ConversationSplitters.Full);
+
+    PrintResults("Call-site Full Split", fullSplitResults);
+
+    // Strategy 5: Custom splitter as call-site override
+    // Same parameter works for built-in and custom splitters
+    AgentEvaluationResults customSplitResults = await agent.EvaluateAsync(
+        queries,
+        new LocalEvaluator(EvalChecks.KeywordCheck("weather")),
+        splitter: new WeatherToolSplitter());
+
+    PrintResults("Custom Splitter Override", customSplitResults);
+    Console.WriteLine();
+}
+finally
+{
+    // Cleanup: always delete the server-side agent, even when a section above throws.
+    if (agent is not null)
+    {
+        await aiProjectClient.Agents.DeleteAgentAsync(agent.Name);
+        Console.WriteLine("Cleanup: Agent deleted.");
+    }
+}
+
+// ============================================================================
+// Helper Functions
+// ============================================================================
+
+// Prints a one-line pass/fail summary for a result set, followed by one line per
+// sub-result when the result set carries a per-agent breakdown.
+static void PrintResults(string title, AgentEvaluationResults results)
+{
+    string status = results.AllPassed ? "✓ ALL PASSED" : "✗ SOME FAILED";
+    Console.WriteLine($" [{title}] {status} ({results.Passed}/{results.Total})");
+
+    if (results.SubResults is not null)
+    {
+        foreach (var (agentId, sub) in results.SubResults)
+        {
+            string subStatus = sub.AllPassed ? "✓" : "✗";
+            Console.WriteLine($" {subStatus} {agentId}: {sub.Passed}/{sub.Total}");
+        }
+    }
+
+    Console.WriteLine();
+}
+
+// ============================================================================
+// Custom Splitter — demonstrates IConversationSplitter
+// ============================================================================
+
+///
+/// Example custom splitter that splits before the first tool call.
+/// Evaluates whether the agent's tool usage and final response are appropriate.
+///
+internal sealed class WeatherToolSplitter : IConversationSplitter
+{
+    // Splits at the first assistant message that contains a function (tool) call:
+    // everything before it is the query, that message and everything after is the response.
+    public (IReadOnlyList<ChatMessage> QueryMessages, IReadOnlyList<ChatMessage> ResponseMessages) Split(
+        IReadOnlyList<ChatMessage> conversation)
+    {
+        for (int i = 0; i < conversation.Count; i++)
+        {
+            if (conversation[i].Role == ChatRole.Assistant
+                && conversation[i].Contents.OfType<FunctionCallContent>().Any())
+            {
+                return (
+                    conversation.Take(i).ToList(),
+                    conversation.Skip(i).ToList());
+            }
+        }
+
+        // Fallback: use the default LastTurn split
+        return ConversationSplitters.LastTurn.Split(conversation);
+    }
+}
diff --git a/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/README.md b/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/README.md
new file mode 100644
index 0000000000..28eab9dd36
--- /dev/null
+++ b/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/README.md
@@ -0,0 +1,50 @@
+# Evaluation — All Patterns
+
+This sample demonstrates all evaluation patterns available in Agent Framework for .NET:
+
+| Section | Pattern | Description |
+|---------|---------|-------------|
+| 1 | **Function Evaluators** | Custom checks using C# lambdas via `FunctionEvaluator.Create()` |
+| 2 | **Built-in Checks** | `EvalChecks.KeywordCheck()` and `EvalChecks.ToolCalledCheck()` |
+| 3 | **MEAI Quality Evaluators** | LLM-based scoring with `RelevanceEvaluator`, `CoherenceEvaluator` |
+| 4 | **Foundry Evaluators** | Cloud-based evaluation via `FoundryEvals` |
+| 5 | **Mixed Evaluators** | Combining local checks with cloud evaluation in one call |
+| 6 | **Pre-existing Responses** | Evaluate saved responses without re-running the agent |
+| 7 | **Conversation Split Strategies** | `ConversationSplitters.LastTurn`, `ConversationSplitters.Full`, `EvalItem.PerTurnItems()`, and custom `IConversationSplitter` implementations |
+
+## Prerequisites
+
+- Azure AI Foundry project with a deployed model
+- Set environment variables:
+  - `AZURE_FOUNDRY_PROJECT_ENDPOINT` — Your Azure AI Foundry project endpoint
+  - `AZURE_FOUNDRY_PROJECT_DEPLOYMENT_NAME` — Model deployment name (default: `gpt-4o-mini`)
+
+## Key Types
+
+```csharp
+// Custom function evaluators
+var check = FunctionEvaluator.Create("name", (string response) => response.Length > 10);
+
+// Built-in checks
+var keyword = EvalChecks.KeywordCheck("expected", "keywords");
+var toolCheck = EvalChecks.ToolCalledCheck("tool_name");
+
+// Local evaluator runs checks without API calls
+var local = new LocalEvaluator(check, keyword, toolCheck);
+
+// MEAI evaluators work directly — no adapter needed
+var results = await agent.EvaluateAsync(queries, new RelevanceEvaluator(), chatConfig);
+
+// Foundry evaluator uses Azure AI Foundry cloud evaluation
+var foundry = new FoundryEvals(chatConfig, FoundryEvals.Relevance, FoundryEvals.Coherence);
+
+// Evaluate an agent
+AgentEvaluationResults localResults = await agent.EvaluateAsync(queries, local);
+localResults.AssertAllPassed();
+```
+
+## Running
+
+```bash
+dotnet run --project FoundryAgents_Evaluations_Step03_AllPatterns.csproj
+```
diff --git a/dotnet/src/Microsoft.Agents.AI.AzureAI/Evaluation/FoundryEvals.cs b/dotnet/src/Microsoft.Agents.AI.AzureAI/Evaluation/FoundryEvals.cs
new file mode 100644
index 0000000000..a731af1099
--- /dev/null
+++ b/dotnet/src/Microsoft.Agents.AI.AzureAI/Evaluation/FoundryEvals.cs
@@ -0,0 +1,237 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+using Microsoft.Extensions.AI;
+using Microsoft.Extensions.AI.Evaluation;
+using Microsoft.Extensions.AI.Evaluation.Quality;
+using Microsoft.Extensions.AI.Evaluation.Safety;
+
+namespace Microsoft.Agents.AI.AzureAI;
+
+/// <summary>
+/// Azure AI Foundry evaluator provider with built-in evaluator name constants.
+/// </summary>
+///
+///
+/// Combines evaluator constants (e.g., <see cref="Relevance"/>, <see cref="Coherence"/>)
+/// with the <see cref="IAgentEvaluator"/> implementation that maps them to MEAI evaluators.
+///
+///
+/// When the Azure.AI.Projects .NET SDK adds native evaluation API support, this class
+/// will be updated to use it for full parity with the Python FoundryEvals class.
+///
+///
+public sealed class FoundryEvals : IAgentEvaluator
+{
+    private readonly ChatConfiguration _chatConfiguration;
+    private readonly string[] _evaluatorNames;
+    private readonly IConversationSplitter? _splitter;
+
+    // -----------------------------------------------------------------------
+    // Constructors
+    // -----------------------------------------------------------------------
+
+    /// <summary>
+    /// Initializes a new instance of the <see cref="FoundryEvals"/> class.
+    /// </summary>
+    /// <param name="chatConfiguration">Chat configuration for the LLM-based evaluators.</param>
+    /// <param name="evaluators">
+    /// Names of evaluators to use (e.g., <see cref="Relevance"/>, <see cref="Coherence"/>).
+    /// When empty, defaults to relevance and coherence.
+    /// </param>
+    /// <exception cref="ArgumentNullException"><paramref name="chatConfiguration"/> is <see langword="null"/>.</exception>
+    public FoundryEvals(ChatConfiguration chatConfiguration, params string[] evaluators)
+        : this(chatConfiguration, splitter: null, evaluators)
+    {
+    }
+
+    /// <summary>
+    /// Initializes a new instance of the <see cref="FoundryEvals"/> class with a default splitter.
+    /// </summary>
+    /// <param name="chatConfiguration">Chat configuration for the LLM-based evaluators.</param>
+    /// <param name="splitter">
+    /// Default conversation splitter for multi-turn conversations. Overridden by
+    /// <see cref="EvalItem.Splitter"/> when set on individual items.
+    /// Use <see cref="ConversationSplitters.LastTurn"/>, <see cref="ConversationSplitters.Full"/>,
+    /// or a custom <see cref="IConversationSplitter"/> implementation.
+    /// </param>
+    /// <param name="evaluators">
+    /// Names of evaluators to use (e.g., <see cref="Relevance"/>, <see cref="Coherence"/>).
+    /// When <see langword="null"/> or empty, defaults to relevance and coherence.
+    /// </param>
+    /// <exception cref="ArgumentNullException"><paramref name="chatConfiguration"/> is <see langword="null"/>.</exception>
+    public FoundryEvals(ChatConfiguration chatConfiguration, IConversationSplitter? splitter, params string[] evaluators)
+    {
+        // Fail at construction instead of deep inside the first evaluation call.
+        this._chatConfiguration = chatConfiguration ?? throw new ArgumentNullException(nameof(chatConfiguration));
+        this._splitter = splitter;
+
+        // An explicit null for the params array is treated like "no names given".
+        // The names are copied so later mutation of the caller's array cannot change this instance.
+        this._evaluatorNames = evaluators is { Length: > 0 }
+            ? [.. evaluators]
+            : [Relevance, Coherence];
+    }
+
+    // -----------------------------------------------------------------------
+    // IAgentEvaluator
+    // -----------------------------------------------------------------------
+
+    /// <inheritdoc />
+    public string Name => "FoundryEvals";
+
+    /// <inheritdoc />
+    /// <remarks>
+    /// Items are evaluated one at a time, in order. For each item the query messages come from the
+    /// effective splitter (item-level, then this instance's default, then whatever
+    /// <c>EvalItem.Split</c> does for <see langword="null"/>), and the response is the item's raw
+    /// response when present, otherwise an assistant message built from its response text.
+    /// <paramref name="evalName"/> is not used by this implementation.
+    /// </remarks>
+    /// <exception cref="ArgumentNullException"><paramref name="items"/> is <see langword="null"/>.</exception>
+    /// <exception cref="ArgumentException">A configured evaluator name is not supported.</exception>
+    public async Task<AgentEvaluationResults> EvaluateAsync(
+        IReadOnlyList<EvalItem> items,
+        string evalName = "Foundry Eval",
+        CancellationToken cancellationToken = default)
+    {
+        if (items is null)
+        {
+            throw new ArgumentNullException(nameof(items));
+        }
+
+        var meaiEvaluators = BuildEvaluators(this._evaluatorNames);
+        var composite = new CompositeEvaluator(meaiEvaluators.ToArray());
+
+        var results = new List<EvaluationResult>(items.Count);
+
+        foreach (var item in items)
+        {
+            cancellationToken.ThrowIfCancellationRequested();
+
+            // Resolve splitter: item-level > evaluator-level > LastTurn default
+            var effectiveSplitter = item.Splitter ?? this._splitter;
+            var (queryMessages, _) = item.Split(effectiveSplitter);
+            var messages = queryMessages.ToList();
+
+            var chatResponse = item.RawResponse
+                ?? new ChatResponse(new ChatMessage(ChatRole.Assistant, item.Response));
+
+            // Grounding context is only supplied when the item carries one.
+            var additionalContext = new List<EvaluationContext>();
+
+            if (item.Context is not null)
+            {
+                additionalContext.Add(new GroundednessEvaluatorContext(item.Context));
+            }
+
+            var result = await composite.EvaluateAsync(
+                messages,
+                chatResponse,
+                this._chatConfiguration,
+                additionalContext: additionalContext.Count > 0 ? additionalContext : null,
+                cancellationToken: cancellationToken).ConfigureAwait(false);
+
+            results.Add(result);
+        }
+
+        return new AgentEvaluationResults(this.Name, results);
+    }
+
+    // -----------------------------------------------------------------------
+    // Evaluator name constants
+    // -----------------------------------------------------------------------
+
+    // Agent behavior
+
+    /// <summary>Evaluates whether the agent correctly resolves user intent.</summary>
+    public const string IntentResolution = "intent_resolution";
+
+    /// <summary>Evaluates whether the agent adheres to its task instructions.</summary>
+    public const string TaskAdherence = "task_adherence";
+
+    /// <summary>Evaluates whether the agent completes the requested task.</summary>
+    public const string TaskCompletion = "task_completion";
+
+    /// <summary>Evaluates the efficiency of the agent's navigation to complete the task.</summary>
+    public const string TaskNavigationEfficiency = "task_navigation_efficiency";
+
+    // Tool usage
+    // Note: BuildEvaluators in this class has no mapping for the agent-behavior and tool-usage
+    // names; passing one of them results in an ArgumentException when evaluators are built.
+
+    /// <summary>Evaluates the accuracy of tool calls made by the agent.</summary>
+    public const string ToolCallAccuracy = "tool_call_accuracy";
+
+    /// <summary>Evaluates whether the agent selects the correct tools.</summary>
+    public const string ToolSelection = "tool_selection";
+
+    /// <summary>Evaluates the accuracy of inputs provided to tools.</summary>
+    public const string ToolInputAccuracy = "tool_input_accuracy";
+
+    /// <summary>Evaluates how well the agent uses tool outputs.</summary>
+    public const string ToolOutputUtilization = "tool_output_utilization";
+
+    /// <summary>Evaluates whether tool calls succeed.</summary>
+    public const string ToolCallSuccess = "tool_call_success";
+
+    // Quality
+    // Note: of the quality names, BuildEvaluators maps Coherence, Fluency, Relevance and
+    // Groundedness; ResponseCompleteness and Similarity are not mapped.
+
+    /// <summary>Evaluates the coherence of the response.</summary>
+    public const string Coherence = "coherence";
+
+    /// <summary>Evaluates the fluency of the response.</summary>
+    public const string Fluency = "fluency";
+
+    /// <summary>Evaluates the relevance of the response to the query.</summary>
+    public const string Relevance = "relevance";
+
+    /// <summary>Evaluates whether the response is grounded in the provided context.</summary>
+    public const string Groundedness = "groundedness";
+
+    /// <summary>Evaluates the completeness of the response.</summary>
+    public const string ResponseCompleteness = "response_completeness";
+
+    /// <summary>Evaluates the similarity between the response and the expected output.</summary>
+    public const string Similarity = "similarity";
+
+    // Safety
+    // Note: BuildEvaluators maps all four safety names to a single shared ContentHarmEvaluator.
+
+    /// <summary>Evaluates the response for violent content.</summary>
+    public const string Violence = "violence";
+
+    /// <summary>Evaluates the response for sexual content.</summary>
+    public const string Sexual = "sexual";
+
+    /// <summary>Evaluates the response for self-harm content.</summary>
+    public const string SelfHarm = "self_harm";
+
+    /// <summary>Evaluates the response for hate or unfairness.</summary>
+    public const string HateUnfairness = "hate_unfairness";
+
+    // -----------------------------------------------------------------------
+    // Internal helpers
+    // -----------------------------------------------------------------------
+
+    /// <summary>
+    /// Maps evaluator names to MEAI evaluator instances, preserving the order of first appearance.
+    /// </summary>
+    /// <param name="names">Evaluator names, normally the constants declared on this class.</param>
+    /// <returns>
+    /// One evaluator per distinct supported quality name, plus at most one
+    /// <see cref="ContentHarmEvaluator"/> shared by all requested safety names.
+    /// </returns>
+    /// <exception cref="ArgumentNullException"><paramref name="names"/> is <see langword="null"/>.</exception>
+    /// <exception cref="ArgumentException">A name has no mapping in the .NET adapter.</exception>
+    internal static List<IEvaluator> BuildEvaluators(string[] names)
+    {
+        if (names is null)
+        {
+            throw new ArgumentNullException(nameof(names));
+        }
+
+        var evaluators = new List<IEvaluator>(names.Length);
+        var seen = new HashSet<string>(StringComparer.Ordinal);
+        bool hasSafetyEvaluator = false;
+
+        foreach (var name in names)
+        {
+            // A name listed twice would otherwise yield two evaluators producing the same metric.
+            // NOTE(review): CompositeEvaluator is believed to reject duplicate metric names — confirm.
+            // A null name skips this check and falls through to the "not supported" branch below.
+            if (name is not null && !seen.Add(name))
+            {
+                continue;
+            }
+
+            IEvaluator? evaluator = name switch
+            {
+                Relevance => new RelevanceEvaluator(),
+                Coherence => new CoherenceEvaluator(),
+                Groundedness => new GroundednessEvaluator(),
+                Fluency => new FluencyEvaluator(),
+
+                // ContentHarmEvaluator covers all harm categories in one call — deduplicate
+                Violence or
+                Sexual or
+                SelfHarm or
+                HateUnfairness when !hasSafetyEvaluator => new ContentHarmEvaluator(),
+
+                Violence or
+                Sexual or
+                SelfHarm or
+                HateUnfairness => null,
+
+                _ => throw new ArgumentException(
+                    $"Evaluator '{name}' is not supported by the .NET FoundryEvals adapter. " +
+                    $"Supported: {Relevance}, {Coherence}, {Groundedness}, {Fluency}, " +
+                    $"{Violence}, {Sexual}, {SelfHarm}, {HateUnfairness}.",
+                    nameof(names)),
+            };
+
+            if (evaluator is ContentHarmEvaluator)
+            {
+                hasSafetyEvaluator = true;
+            }
+
+            if (evaluator is not null)
+            {
+                evaluators.Add(evaluator);
+            }
+        }
+
+        return evaluators;
+    }
+}
diff --git a/dotnet/src/Microsoft.Agents.AI.AzureAI/Microsoft.Agents.AI.AzureAI.csproj b/dotnet/src/Microsoft.Agents.AI.AzureAI/Microsoft.Agents.AI.AzureAI.csproj
index 0cd8690126..fce34b7201 100644
--- a/dotnet/src/Microsoft.Agents.AI.AzureAI/Microsoft.Agents.AI.AzureAI.csproj
+++ b/dotnet/src/Microsoft.Agents.AI.AzureAI/Microsoft.Agents.AI.AzureAI.csproj
@@ -20,6 +20,20 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/dotnet/src/Microsoft.Agents.AI.Workflows/Evaluation/WorkflowEvaluationExtensions.cs b/dotnet/src/Microsoft.Agents.AI.Workflows/Evaluation/WorkflowEvaluationExtensions.cs
new file mode 100644
index 0000000000..badf6ff642
--- /dev/null
+++ b/dotnet/src/Microsoft.Agents.AI.Workflows/Evaluation/WorkflowEvaluationExtensions.cs
@@ -0,0 +1,135 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Threading;
+using System.Threading.Tasks;
+using Microsoft.Extensions.AI;
+using Microsoft.Extensions.AI.Evaluation;
+
+namespace Microsoft.Agents.AI.Workflows;
+
+/// <summary>
+/// Extension methods for evaluating workflow runs.
+/// </summary>
+public static class WorkflowEvaluationExtensions
+{
+    /// <summary>
+    /// Evaluates a completed workflow run.
+    /// </summary>
+    /// <param name="run">The completed workflow run.</param>
+    /// <param name="evaluator">The evaluator to score results.</param>
+    /// <param name="includeOverall">Whether to include an overall evaluation.</param>
+    /// <param name="includePerAgent">Whether to include per-agent breakdowns.</param>
+    /// <param name="evalName">Display name for this evaluation run.</param>
+    /// <param name="splitter">
+    /// Optional conversation splitter to apply to all items.
+    /// Use <see cref="ConversationSplitters.LastTurn"/>, <see cref="ConversationSplitters.Full"/>,
+    /// or a custom <see cref="IConversationSplitter"/> implementation.
+    /// </param>
+    /// <param name="cancellationToken">Cancellation token.</param>
+    /// <returns>Evaluation results with optional per-agent sub-results.</returns>
+    public static async Task<AgentEvaluationResults> EvaluateAsync(
+        this Run run,
+        IAgentEvaluator evaluator,
+        bool includeOverall = true,
+        bool includePerAgent = true,
+        string evalName = "Workflow Eval",
+        IConversationSplitter? splitter = null,
+        CancellationToken cancellationToken = default)
+    {
+        if (run is null)
+        {
+            throw new ArgumentNullException(nameof(run));
+        }
+
+        if (evaluator is null)
+        {
+            throw new ArgumentNullException(nameof(evaluator));
+        }
+
+        // Snapshot once: the event stream is scanned several times below.
+        var events = run.OutgoingEvents.ToList();
+
+        // Build overall items from final output
+        var overallItems = new List<EvalItem>();
+        if (includeOverall)
+        {
+            // NOTE(review): the generic argument was lost from the source text; AgentResponseEvent is
+            // reconstructed from the use of `.Response.Text` below — confirm against the Workflows event types.
+            var finalResponse = events.OfType<AgentResponseEvent>().LastOrDefault();
+            if (finalResponse is not null)
+            {
+                // The first executor invocation's input stands in for the workflow's query.
+                var firstInvoked = events.OfType<ExecutorInvokedEvent>().FirstOrDefault();
+                var query = firstInvoked?.Data?.ToString() ?? string.Empty;
+
+                overallItems.Add(CreateItem(query, finalResponse.Response.Text, splitter));
+            }
+        }
+
+        // Evaluate overall; with nothing to evaluate, return an empty result set rather than calling the evaluator.
+        var overallResult = overallItems.Count > 0
+            ? await evaluator.EvaluateAsync(overallItems, evalName, cancellationToken).ConfigureAwait(false)
+            : new AgentEvaluationResults(evaluator.Name, Array.Empty<EvaluationResult>());
+
+        // Per-agent breakdown (only extracted when requested)
+        if (includePerAgent)
+        {
+            var agentData = ExtractAgentData(events, splitter);
+            if (agentData.Count > 0)
+            {
+                var subResults = new Dictionary<string, AgentEvaluationResults>();
+
+                foreach (var kvp in agentData)
+                {
+                    subResults[kvp.Key] = await evaluator.EvaluateAsync(
+                        kvp.Value,
+                        $"{evalName} - {kvp.Key}",
+                        cancellationToken).ConfigureAwait(false);
+                }
+
+                overallResult.SubResults = subResults;
+            }
+        }
+
+        return overallResult;
+    }
+
+    /// <summary>
+    /// Pairs each executor-invoked event with the next completed event for the same executor id
+    /// and turns every pair into an <see cref="EvalItem"/>, grouped by executor id.
+    /// </summary>
+    /// <param name="events">Workflow events in emission order.</param>
+    /// <param name="splitter">Splitter stamped on every produced item; may be <see langword="null"/>.</param>
+    /// <returns>
+    /// Items keyed by executor id. Completed events without a pending invocation are ignored, and
+    /// every executor that emits both events is included, not only agent executors.
+    /// </returns>
+    internal static Dictionary<string, List<EvalItem>> ExtractAgentData(
+        List<WorkflowEvent> events,
+        IConversationSplitter? splitter)
+    {
+        var invoked = new Dictionary<string, ExecutorInvokedEvent>();
+        var agentData = new Dictionary<string, List<EvalItem>>();
+
+        foreach (var evt in events)
+        {
+            if (evt is ExecutorInvokedEvent invokedEvent)
+            {
+                // A later invocation of the same executor replaces an unmatched earlier one.
+                invoked[invokedEvent.ExecutorId] = invokedEvent;
+            }
+            else if (evt is ExecutorCompletedEvent completedEvent
+                && invoked.TryGetValue(completedEvent.ExecutorId, out var matchingInvoked))
+            {
+                var query = matchingInvoked.Data?.ToString() ?? string.Empty;
+                var responseText = completedEvent.Data?.ToString() ?? string.Empty;
+
+                if (!agentData.TryGetValue(completedEvent.ExecutorId, out var items))
+                {
+                    items = new List<EvalItem>();
+                    agentData[completedEvent.ExecutorId] = items;
+                }
+
+                items.Add(CreateItem(query, responseText, splitter));
+
+                // Consume the invocation so a second completion cannot pair with it again.
+                invoked.Remove(completedEvent.ExecutorId);
+            }
+        }
+
+        return agentData;
+    }
+
+    // Builds a single-turn item (one user message, one assistant message) carrying the given splitter.
+    private static EvalItem CreateItem(string query, string responseText, IConversationSplitter? splitter)
+    {
+        var conversation = new List<ChatMessage>
+        {
+            new(ChatRole.User, query),
+            new(ChatRole.Assistant, responseText),
+        };
+
+        return new EvalItem(query, responseText, conversation)
+        {
+            Splitter = splitter,
+        };
+    }
+}
diff --git a/dotnet/src/Microsoft.Agents.AI.Workflows/Microsoft.Agents.AI.Workflows.csproj b/dotnet/src/Microsoft.Agents.AI.Workflows/Microsoft.Agents.AI.Workflows.csproj
index c103ead32d..0e4e20e47b 100644
--- a/dotnet/src/Microsoft.Agents.AI.Workflows/Microsoft.Agents.AI.Workflows.csproj
+++ b/dotnet/src/Microsoft.Agents.AI.Workflows/Microsoft.Agents.AI.Workflows.csproj
@@ -54,4 +54,9 @@
+
+
+
+
+
diff --git a/dotnet/src/Microsoft.Agents.AI/Evaluation/AgentEvaluationExtensions.cs b/dotnet/src/Microsoft.Agents.AI/Evaluation/AgentEvaluationExtensions.cs
new file mode 100644
index 0000000000..31904218ad
--- /dev/null
+++ b/dotnet/src/Microsoft.Agents.AI/Evaluation/AgentEvaluationExtensions.cs
@@ -0,0 +1,355 @@
+// Copyright (c) Microsoft. All rights reserved.
+ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Extensions.AI; +using Microsoft.Extensions.AI.Evaluation; + +namespace Microsoft.Agents.AI; + +/// +/// Extension methods for evaluating agents, responses, and workflow runs. +/// +public static partial class AgentEvaluationExtensions +{ + /// + /// Evaluates an agent by running it against test queries and scoring the responses. + /// + /// The agent to evaluate. + /// Test queries to send to the agent. + /// The evaluator to score responses. + /// Display name for this evaluation run. + /// + /// Optional ground-truth expected outputs, one per query. When provided, + /// must be the same length as . Each value is + /// stamped on the corresponding . + /// + /// + /// Optional expected tool calls, one list per query. When provided, + /// must be the same length as . Each list is + /// stamped on the corresponding . + /// + /// + /// Optional conversation splitter to apply to all items. + /// Use , , + /// or a custom implementation. + /// + /// + /// Number of times to run each query (default 1). When greater than 1, each query is invoked + /// independently N times to measure consistency. Results contain all N × queries.Count items. + /// + /// Cancellation token. + /// Evaluation results. + public static async Task EvaluateAsync( + this AIAgent agent, + IEnumerable queries, + IAgentEvaluator evaluator, + string evalName = "Agent Framework Eval", + IEnumerable? expectedOutput = null, + IEnumerable>? expectedToolCalls = null, + IConversationSplitter? 
splitter = null, + int numRepetitions = 1, + CancellationToken cancellationToken = default) + { + var items = await RunAgentForEvalAsync(agent, queries, expectedOutput, expectedToolCalls, splitter, numRepetitions, cancellationToken).ConfigureAwait(false); + return await evaluator.EvaluateAsync(items, evalName, cancellationToken).ConfigureAwait(false); + } + + /// + /// Evaluates an agent using an MEAI evaluator directly. + /// + /// The agent to evaluate. + /// Test queries to send to the agent. + /// The MEAI evaluator (e.g., RelevanceEvaluator, CompositeEvaluator). + /// Chat configuration for the MEAI evaluator (includes the judge model). + /// Display name for this evaluation run. + /// + /// Optional ground-truth expected outputs, one per query. + /// + /// + /// Optional expected tool calls, one list per query. + /// + /// + /// Optional conversation splitter to apply to all items. + /// Use , , + /// or a custom implementation. + /// + /// + /// Number of times to run each query (default 1). When greater than 1, each query is invoked + /// independently N times to measure consistency. + /// + /// Cancellation token. + /// Evaluation results. + public static async Task EvaluateAsync( + this AIAgent agent, + IEnumerable queries, + IEvaluator evaluator, + ChatConfiguration chatConfiguration, + string evalName = "Agent Framework Eval", + IEnumerable? expectedOutput = null, + IEnumerable>? expectedToolCalls = null, + IConversationSplitter? splitter = null, + int numRepetitions = 1, + CancellationToken cancellationToken = default) + { + var wrapped = new MeaiEvaluatorAdapter(evaluator, chatConfiguration); + return await agent.EvaluateAsync(queries, wrapped, evalName, expectedOutput, expectedToolCalls, splitter, numRepetitions, cancellationToken).ConfigureAwait(false); + } + + /// + /// Evaluates an agent by running it against test queries with multiple evaluators. + /// + /// The agent to evaluate. + /// Test queries to send to the agent. 
+ /// The evaluators to score responses. + /// Display name for this evaluation run. + /// + /// Optional ground-truth expected outputs, one per query. + /// + /// + /// Optional expected tool calls, one list per query. + /// + /// + /// Optional conversation splitter to apply to all items. + /// Use , , + /// or a custom implementation. + /// + /// + /// Number of times to run each query (default 1). When greater than 1, each query is invoked + /// independently N times to measure consistency. + /// + /// Cancellation token. + /// One result per evaluator. + public static async Task> EvaluateAsync( + this AIAgent agent, + IEnumerable queries, + IEnumerable evaluators, + string evalName = "Agent Framework Eval", + IEnumerable? expectedOutput = null, + IEnumerable>? expectedToolCalls = null, + IConversationSplitter? splitter = null, + int numRepetitions = 1, + CancellationToken cancellationToken = default) + { + var items = await RunAgentForEvalAsync(agent, queries, expectedOutput, expectedToolCalls, splitter, numRepetitions, cancellationToken).ConfigureAwait(false); + + var results = new List(); + foreach (var evaluator in evaluators) + { + var result = await evaluator.EvaluateAsync(items, evalName, cancellationToken).ConfigureAwait(false); + results.Add(result); + } + + return results; + } + + /// + /// Evaluates pre-existing agent responses without re-running the agent. + /// + /// The agent (used for tool definitions). + /// Pre-existing agent responses. + /// The queries that produced each response (must match count). + /// The evaluator to score responses. + /// Display name for this evaluation run. + /// + /// Optional ground-truth expected outputs, one per query. + /// + /// + /// Optional expected tool calls, one list per query. + /// + /// Cancellation token. + /// Evaluation results. 
+ public static async Task EvaluateAsync( + this AIAgent agent, + IEnumerable responses, + IEnumerable queries, + IAgentEvaluator evaluator, + string evalName = "Agent Framework Eval", + IEnumerable? expectedOutput = null, + IEnumerable>? expectedToolCalls = null, + CancellationToken cancellationToken = default) + { + var items = BuildItemsFromResponses(agent, responses, queries, expectedOutput, expectedToolCalls); + return await evaluator.EvaluateAsync(items, evalName, cancellationToken).ConfigureAwait(false); + } + + /// + /// Evaluates pre-existing agent responses using an MEAI evaluator directly. + /// + /// The agent (used for tool definitions). + /// Pre-existing agent responses. + /// The queries that produced each response (must match count). + /// The MEAI evaluator. + /// Chat configuration for the MEAI evaluator. + /// Display name for this evaluation run. + /// + /// Optional ground-truth expected outputs, one per query. + /// + /// + /// Optional expected tool calls, one list per query. + /// + /// Cancellation token. + /// Evaluation results. + public static async Task EvaluateAsync( + this AIAgent agent, + IEnumerable responses, + IEnumerable queries, + IEvaluator evaluator, + ChatConfiguration chatConfiguration, + string evalName = "Agent Framework Eval", + IEnumerable? expectedOutput = null, + IEnumerable>? expectedToolCalls = null, + CancellationToken cancellationToken = default) + { + var wrapped = new MeaiEvaluatorAdapter(evaluator, chatConfiguration); + return await agent.EvaluateAsync(responses, queries, wrapped, evalName, expectedOutput, expectedToolCalls, cancellationToken).ConfigureAwait(false); + } + + internal static List BuildItemsFromResponses( + AIAgent agent, + IEnumerable responses, + IEnumerable queries, + IEnumerable? expectedOutput, + IEnumerable>? 
expectedToolCalls) + { + var responseList = responses.ToList(); + var queryList = queries.ToList(); + var expectedList = expectedOutput?.ToList(); + var expectedToolCallsList = expectedToolCalls?.ToList(); + + if (responseList.Count != queryList.Count) + { + throw new ArgumentException( + $"Got {queryList.Count} queries but {responseList.Count} responses. Counts must match."); + } + + if (expectedList != null && expectedList.Count != queryList.Count) + { + throw new ArgumentException( + $"Got {queryList.Count} queries but {expectedList.Count} expectedOutput values. Counts must match."); + } + + if (expectedToolCallsList != null && expectedToolCallsList.Count != queryList.Count) + { + throw new ArgumentException( + $"Got {queryList.Count} queries but {expectedToolCallsList.Count} expectedToolCalls lists. Counts must match."); + } + + var items = new List(); + for (int i = 0; i < responseList.Count; i++) + { + var query = queryList[i]; + var response = responseList[i]; + + var messages = new List + { + new(ChatRole.User, query), + }; + messages.AddRange(response.Messages); + + var item = BuildEvalItem(query, response, messages, agent); + if (expectedList != null) + { + item.ExpectedOutput = expectedList[i]; + } + + if (expectedToolCallsList != null) + { + item.ExpectedToolCalls = expectedToolCallsList[i].ToList(); + } + + items.Add(item); + } + + return items; + } + + private static async Task> RunAgentForEvalAsync( + AIAgent agent, + IEnumerable queries, + IEnumerable? expectedOutput, + IEnumerable>? expectedToolCalls, + IConversationSplitter? 
splitter, + int numRepetitions, + CancellationToken cancellationToken) + { + if (numRepetitions < 1) + { + throw new ArgumentException($"numRepetitions must be >= 1, got {numRepetitions}.", nameof(numRepetitions)); + } + + var items = new List(); + var queryList = queries.ToList(); + var expectedList = expectedOutput?.ToList(); + var expectedToolCallsList = expectedToolCalls?.ToList(); + + if (expectedList != null && expectedList.Count != queryList.Count) + { + throw new ArgumentException( + $"Got {queryList.Count} queries but {expectedList.Count} expectedOutput values. Counts must match."); + } + + if (expectedToolCallsList != null && expectedToolCallsList.Count != queryList.Count) + { + throw new ArgumentException( + $"Got {queryList.Count} queries but {expectedToolCallsList.Count} expectedToolCalls lists. Counts must match."); + } + + for (int rep = 0; rep < numRepetitions; rep++) + { + for (int i = 0; i < queryList.Count; i++) + { + cancellationToken.ThrowIfCancellationRequested(); + + var query = queryList[i]; + var messages = new List + { + new(ChatRole.User, query), + }; + + var response = await agent.RunAsync(messages, cancellationToken: cancellationToken).ConfigureAwait(false); + var item = BuildEvalItem(query, response, messages, agent); + item.Splitter = splitter; + if (expectedList != null) + { + item.ExpectedOutput = expectedList[i]; + } + + if (expectedToolCallsList != null) + { + item.ExpectedToolCalls = expectedToolCallsList[i].ToList(); + } + + items.Add(item); + } + } + + return items; + } + + internal static EvalItem BuildEvalItem( + string query, + AgentResponse response, + List messages, + AIAgent agent) + { + // Build conversation from existing messages plus any new response messages + var conversation = new List(messages); + foreach (var msg in response.Messages) + { + if (!conversation.Contains(msg)) + { + conversation.Add(msg); + } + } + + return new EvalItem(query, response.Text, conversation) + { + RawResponse = new 
ChatResponse(response.Messages.LastOrDefault() + ?? new ChatMessage(ChatRole.Assistant, response.Text)), + }; + } +} diff --git a/dotnet/src/Microsoft.Agents.AI/Evaluation/AgentEvaluationResults.cs b/dotnet/src/Microsoft.Agents.AI/Evaluation/AgentEvaluationResults.cs new file mode 100644 index 0000000000..c46bc8046b --- /dev/null +++ b/dotnet/src/Microsoft.Agents.AI/Evaluation/AgentEvaluationResults.cs @@ -0,0 +1,127 @@ +// Copyright (c) Microsoft. All rights reserved. + +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.Extensions.AI.Evaluation; + +namespace Microsoft.Agents.AI; + +/// +/// Aggregate evaluation results across multiple items. +/// +public sealed class AgentEvaluationResults +{ + private readonly List _items; + + /// + /// Initializes a new instance of the class. + /// + /// Name of the evaluation provider. + /// Per-item MEAI evaluation results. + /// The original eval items that were evaluated, for auditing. + public AgentEvaluationResults(string provider, IEnumerable items, IReadOnlyList? inputItems = null) + { + this.Provider = provider; + this._items = new List(items); + this.InputItems = inputItems; + } + + /// Gets the evaluation provider name. + public string Provider { get; } + + /// Gets the portal URL for viewing results (Foundry only). + public Uri? ReportUrl { get; set; } + + /// Gets the per-item MEAI evaluation results. + public IReadOnlyList Items => this._items; + + /// + /// Gets the original eval items that produced these results, for auditing. + /// Each entry corresponds positionally to InputItems[i] + /// is the query/response that produced Items[i]. + /// + public IReadOnlyList? InputItems { get; } + + /// Gets per-agent results for workflow evaluations. + public IReadOnlyDictionary? SubResults { get; set; } + + /// Gets the number of items that passed. + public int Passed => this._items.Count(ItemPassed); + + /// Gets the number of items that failed. 
+ public int Failed => this._items.Count(i => !ItemPassed(i)); + + /// Gets the total number of items evaluated. + public int Total => this._items.Count; + + /// Gets whether all items passed. + public bool AllPassed + { + get + { + if (this.SubResults is not null) + { + return this.SubResults.Values.All(s => s.AllPassed) + && (this.Total == 0 || this.Failed == 0); + } + + return this.Total > 0 && this.Failed == 0; + } + } + + /// + /// Asserts that all items passed. Throws on failure. + /// + /// Optional custom failure message. + /// Thrown when any items failed. + public void AssertAllPassed(string? message = null) + { + if (!this.AllPassed) + { + var detail = message ?? $"{this.Provider}: {this.Passed} passed, {this.Failed} failed out of {this.Total}."; + if (this.ReportUrl is not null) + { + detail += $" See {this.ReportUrl} for details."; + } + + if (this.SubResults is not null) + { + var failedAgents = this.SubResults + .Where(kvp => !kvp.Value.AllPassed) + .Select(kvp => kvp.Key); + detail += $" Failed agents: {string.Join(", ", failedAgents)}."; + } + + throw new InvalidOperationException(detail); + } + } + + private static bool ItemPassed(EvaluationResult result) + { + foreach (var metric in result.Metrics.Values) + { + if (metric.Interpretation?.Failed == true) + { + return false; + } + + if (metric is NumericMetric numeric && numeric.Value.HasValue) + { + if (numeric.Value.Value < 3.0) + { + return false; + } + } + else if (metric is BooleanMetric boolean && boolean.Value.HasValue) + { + if (!boolean.Value.Value) + { + return false; + } + } + } + + return result.Metrics.Count > 0; + } +} diff --git a/dotnet/src/Microsoft.Agents.AI/Evaluation/CheckResult.cs b/dotnet/src/Microsoft.Agents.AI/Evaluation/CheckResult.cs new file mode 100644 index 0000000000..46f47bb3c9 --- /dev/null +++ b/dotnet/src/Microsoft.Agents.AI/Evaluation/CheckResult.cs @@ -0,0 +1,11 @@ +// Copyright (c) Microsoft. All rights reserved. 
+ +namespace Microsoft.Agents.AI; + +/// +/// Result of a single check on a single evaluation item. +/// +/// Whether the check passed. +/// Human-readable explanation. +/// Name of the check that produced this result. +public sealed record EvalCheckResult(bool Passed, string Reason, string CheckName); diff --git a/dotnet/src/Microsoft.Agents.AI/Evaluation/EvalCheck.cs b/dotnet/src/Microsoft.Agents.AI/Evaluation/EvalCheck.cs new file mode 100644 index 0000000000..eae0750418 --- /dev/null +++ b/dotnet/src/Microsoft.Agents.AI/Evaluation/EvalCheck.cs @@ -0,0 +1,10 @@ +// Copyright (c) Microsoft. All rights reserved. + +namespace Microsoft.Agents.AI; + +/// +/// Delegate for a synchronous evaluation check on a single item. +/// +/// The evaluation item. +/// The check result. +public delegate EvalCheckResult EvalCheck(EvalItem item); diff --git a/dotnet/src/Microsoft.Agents.AI/Evaluation/EvalChecks.cs b/dotnet/src/Microsoft.Agents.AI/Evaluation/EvalChecks.cs new file mode 100644 index 0000000000..5dfa2da612 --- /dev/null +++ b/dotnet/src/Microsoft.Agents.AI/Evaluation/EvalChecks.cs @@ -0,0 +1,86 @@ +// Copyright (c) Microsoft. All rights reserved. + +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.Extensions.AI; + +namespace Microsoft.Agents.AI; + +/// +/// Built-in check functions for common evaluation patterns. +/// +public static class EvalChecks +{ + /// + /// Creates a check that verifies the response contains all specified keywords. + /// + /// Keywords that must appear in the response. + /// An delegate. + public static EvalCheck KeywordCheck(params string[] keywords) + { + return KeywordCheck(caseSensitive: false, keywords); + } + + /// + /// Creates a check that verifies the response contains all specified keywords. + /// + /// Whether the comparison is case-sensitive. + /// Keywords that must appear in the response. + /// An delegate. 
+ public static EvalCheck KeywordCheck(bool caseSensitive, params string[] keywords) + { + return (EvalItem item) => + { + var comparison = caseSensitive + ? StringComparison.Ordinal + : StringComparison.OrdinalIgnoreCase; + + var missing = keywords + .Where(kw => !item.Response.Contains(kw, comparison)) + .ToList(); + + var passed = missing.Count == 0; + var reason = passed + ? $"All keywords found: {string.Join(", ", keywords)}" + : $"Missing keywords: {string.Join(", ", missing)}"; + + return new EvalCheckResult(passed, reason, "keyword_check"); + }; + } + + /// + /// Creates a check that verifies specific tools were called in the conversation. + /// + /// Tool names that must appear in the conversation. + /// An delegate. + public static EvalCheck ToolCalledCheck(params string[] toolNames) + { + return (EvalItem item) => + { + var calledTools = new HashSet(StringComparer.OrdinalIgnoreCase); + + foreach (var message in item.Conversation) + { + foreach (var content in message.Contents) + { + if (content is FunctionCallContent functionCall) + { + calledTools.Add(functionCall.Name); + } + } + } + + var missing = toolNames + .Where(t => !calledTools.Contains(t)) + .ToList(); + + var passed = missing.Count == 0; + var reason = passed + ? $"All tools called: {string.Join(", ", toolNames)}" + : $"Missing tool calls: {string.Join(", ", missing)}"; + + return new EvalCheckResult(passed, reason, "tool_called_check"); + }; + } +} diff --git a/dotnet/src/Microsoft.Agents.AI/Evaluation/EvalItem.cs b/dotnet/src/Microsoft.Agents.AI/Evaluation/EvalItem.cs new file mode 100644 index 0000000000..93e860ae65 --- /dev/null +++ b/dotnet/src/Microsoft.Agents.AI/Evaluation/EvalItem.cs @@ -0,0 +1,140 @@ +// Copyright (c) Microsoft. All rights reserved. + +using System.Collections.Generic; +using System.Linq; +using Microsoft.Extensions.AI; + +namespace Microsoft.Agents.AI; + +/// +/// Provider-agnostic data for a single evaluation item. 
+/// +public sealed class EvalItem +{ + /// + /// Initializes a new instance of the class. + /// + /// The user query. + /// The agent response text. + /// The full conversation as list. + public EvalItem(string query, string response, IReadOnlyList conversation) + { + this.Query = query; + this.Response = response; + this.Conversation = conversation; + } + + /// Gets the user query. + public string Query { get; } + + /// Gets the agent response text. + public string Response { get; } + + /// Gets the full conversation history. + public IReadOnlyList Conversation { get; } + + /// Gets or sets the tools available to the agent. + public IReadOnlyList? Tools { get; set; } + + /// Gets or sets grounding context for evaluation. + public string? Context { get; set; } + + /// Gets or sets the expected output for ground-truth comparison. + public string? ExpectedOutput { get; set; } + + /// + /// Gets or sets the expected tool calls for tool-correctness evaluation. + /// + /// + /// Each entry describes a tool call the agent should make. The evaluator + /// decides matching semantics (ordering, extras, argument checking). + /// See . + /// + public IReadOnlyList? ExpectedToolCalls { get; set; } + + /// Gets or sets the raw chat response for MEAI evaluators. + public ChatResponse? RawResponse { get; set; } + + /// + /// Gets or sets the conversation splitter for this item. + /// + /// + /// When set by orchestration functions (e.g. EvaluateAsync(splitter: ...)), + /// this is used as the default by . + /// Priority: explicit Split(splitter) argument > + /// > . + /// + public IConversationSplitter? Splitter { get; set; } + + /// + /// Splits the conversation into query messages and response messages. + /// + /// + /// The splitter to use. When null, uses + /// if set, otherwise . + /// + /// A tuple of (query messages, response messages). + public (IReadOnlyList QueryMessages, IReadOnlyList ResponseMessages) Split( + IConversationSplitter? 
splitter = null) + { + var effective = splitter ?? this.Splitter ?? ConversationSplitters.LastTurn; + return effective.Split(this.Conversation); + } + + /// + /// Splits a multi-turn conversation into one per user turn. + /// + /// + /// Each user message starts a new turn. The resulting item has cumulative context: + /// query messages contain the full conversation up to and including that user message, + /// and the response is everything up to the next user message. + /// + /// The full conversation to split. + /// Optional tools available to the agent. + /// Optional grounding context. + /// A list of eval items, one per user turn. + public static IReadOnlyList PerTurnItems( + IReadOnlyList conversation, + IReadOnlyList? tools = null, + string? context = null) + { + var items = new List(); + var userIndices = new List(); + + for (int i = 0; i < conversation.Count; i++) + { + if (conversation[i].Role == ChatRole.User) + { + userIndices.Add(i); + } + } + + for (int t = 0; t < userIndices.Count; t++) + { + int userIdx = userIndices[t]; + int nextBoundary = t + 1 < userIndices.Count + ? userIndices[t + 1] + : conversation.Count; + + var responseMessages = conversation.Skip(userIdx + 1).Take(nextBoundary - userIdx - 1).ToList(); + + var query = conversation[userIdx].Text ?? 
string.Empty; + var responseText = string.Join( + " ", + responseMessages + .Where(m => m.Role == ChatRole.Assistant && !string.IsNullOrEmpty(m.Text)) + .Select(m => m.Text)); + + var fullSlice = conversation.Take(nextBoundary).ToList(); + var item = new EvalItem(query, responseText, fullSlice) + { + Tools = tools, + Context = context, + }; + + items.Add(item); + } + + return items; + } +} diff --git a/dotnet/src/Microsoft.Agents.AI/Evaluation/ExpectedToolCall.cs b/dotnet/src/Microsoft.Agents.AI/Evaluation/ExpectedToolCall.cs new file mode 100644 index 0000000000..9b30899df4 --- /dev/null +++ b/dotnet/src/Microsoft.Agents.AI/Evaluation/ExpectedToolCall.cs @@ -0,0 +1,20 @@ +// Copyright (c) Microsoft. All rights reserved. + +using System.Collections.Generic; + +namespace Microsoft.Agents.AI; + +/// +/// A tool call that an agent is expected to make. +/// +/// +/// Used with EvaluateAsync to assert that the agent called the correct tools. +/// The evaluator decides matching semantics (order, extras, argument checking); +/// this type is pure data. +/// +/// The tool/function name (e.g. "get_weather"). +/// +/// Expected arguments. null means "don't check arguments". +/// When provided, evaluators typically do subset matching (all expected keys must be present). +/// +public record ExpectedToolCall(string Name, IReadOnlyDictionary? Arguments = null); diff --git a/dotnet/src/Microsoft.Agents.AI/Evaluation/FunctionEvaluator.cs b/dotnet/src/Microsoft.Agents.AI/Evaluation/FunctionEvaluator.cs new file mode 100644 index 0000000000..a9024c7750 --- /dev/null +++ b/dotnet/src/Microsoft.Agents.AI/Evaluation/FunctionEvaluator.cs @@ -0,0 +1,68 @@ +// Copyright (c) Microsoft. All rights reserved. + +using System; + +namespace Microsoft.Agents.AI; + +/// +/// Factory for creating delegates from typed lambda functions. +/// +public static class FunctionEvaluator +{ + /// + /// Creates a check from a function that takes the response text and returns a bool. 
+ /// + /// Check name for reporting. + /// Function that returns true if the response passes. + public static EvalCheck Create(string name, Func check) + { + return (EvalItem item) => + { + var passed = check(item.Response); + return new EvalCheckResult(passed, passed ? "Passed" : "Failed", name); + }; + } + + /// + /// Creates a check from a function that takes response and expected text. + /// + /// Check name for reporting. + /// Function that returns true if the response passes. + public static EvalCheck Create(string name, Func check) + { + return (EvalItem item) => + { + var passed = check(item.Response, item.ExpectedOutput); + return new EvalCheckResult(passed, passed ? "Passed" : "Failed", name); + }; + } + + /// + /// Creates a check from a function that takes the full . + /// + /// Check name for reporting. + /// Function that returns true if the item passes. + public static EvalCheck Create(string name, Func check) + { + return (EvalItem item) => + { + var passed = check(item); + return new EvalCheckResult(passed, passed ? "Passed" : "Failed", name); + }; + } + + /// + /// Creates a check from a function that takes the full + /// and returns a . + /// + /// Check name (used as fallback if the result has no name). + /// Function that returns a full check result. + public static EvalCheck Create(string name, Func check) + { + return (EvalItem item) => + { + var result = check(item); + return result with { CheckName = result.CheckName ?? name }; + }; + } +} diff --git a/dotnet/src/Microsoft.Agents.AI/Evaluation/IAgentEvaluator.cs b/dotnet/src/Microsoft.Agents.AI/Evaluation/IAgentEvaluator.cs new file mode 100644 index 0000000000..2dc84e35eb --- /dev/null +++ b/dotnet/src/Microsoft.Agents.AI/Evaluation/IAgentEvaluator.cs @@ -0,0 +1,33 @@ +// Copyright (c) Microsoft. All rights reserved. 
+ +using System.Collections.Generic; +using System.Threading; +using System.Threading.Tasks; + +namespace Microsoft.Agents.AI; + +/// +/// Batch-oriented evaluator interface for agent evaluation. +/// +/// +/// Unlike MEAI's IEvaluator which evaluates one item at a time, +/// evaluates a batch of items. This enables +/// efficient cloud-based evaluation (e.g., Foundry) and aggregate result computation. +/// +public interface IAgentEvaluator +{ + /// Gets the evaluator name. + string Name { get; } + + /// + /// Evaluates a batch of items and returns aggregate results. + /// + /// The items to evaluate. + /// A display name for this evaluation run. + /// Cancellation token. + /// Aggregate evaluation results. + Task EvaluateAsync( + IReadOnlyList items, + string evalName = "Agent Framework Eval", + CancellationToken cancellationToken = default); +} diff --git a/dotnet/src/Microsoft.Agents.AI/Evaluation/IConversationSplitter.cs b/dotnet/src/Microsoft.Agents.AI/Evaluation/IConversationSplitter.cs new file mode 100644 index 0000000000..f07282e4de --- /dev/null +++ b/dotnet/src/Microsoft.Agents.AI/Evaluation/IConversationSplitter.cs @@ -0,0 +1,103 @@ +// Copyright (c) Microsoft. All rights reserved. + +using System.Collections.Generic; +using System.Linq; +using Microsoft.Extensions.AI; + +namespace Microsoft.Agents.AI; + +/// +/// Strategy for splitting a conversation into query and response halves for evaluation. +/// +/// +/// Use one of the built-in splitters from or implement +/// your own for domain-specific splitting logic (e.g., splitting before a memory-retrieval +/// tool call to evaluate recall quality). +/// +public interface IConversationSplitter +{ + /// + /// Splits a conversation into query messages and response messages. + /// + /// The full conversation to split. + /// A tuple of (query messages, response messages). 
+ (IReadOnlyList QueryMessages, IReadOnlyList ResponseMessages) Split( + IReadOnlyList conversation); +} + +/// +/// Built-in conversation splitters for common evaluation patterns. +/// +/// +/// +/// : Evaluates whether the agent answered the latest question well. +/// : Evaluates whether the whole conversation trajectory served the original request. +/// +/// For custom splits, implement directly. +/// +public static class ConversationSplitters +{ + /// + /// Split at the last user message. Everything up to and including that message + /// is the query; everything after is the response. This is the default strategy. + /// + public static IConversationSplitter LastTurn { get; } = new LastTurnSplitter(); + + /// + /// The first user message (and any preceding system messages) is the query; + /// the entire remainder of the conversation is the response. + /// Evaluates overall conversation trajectory. + /// + public static IConversationSplitter Full { get; } = new FullSplitter(); + + private sealed class LastTurnSplitter : IConversationSplitter + { + public (IReadOnlyList, IReadOnlyList) Split( + IReadOnlyList conversation) + { + int lastUserIdx = -1; + for (int i = 0; i < conversation.Count; i++) + { + if (conversation[i].Role == ChatRole.User) + { + lastUserIdx = i; + } + } + + if (lastUserIdx >= 0) + { + return ( + conversation.Take(lastUserIdx + 1).ToList(), + conversation.Skip(lastUserIdx + 1).ToList()); + } + + return (new List(), conversation.ToList()); + } + } + + private sealed class FullSplitter : IConversationSplitter + { + public (IReadOnlyList, IReadOnlyList) Split( + IReadOnlyList conversation) + { + int firstUserIdx = -1; + for (int i = 0; i < conversation.Count; i++) + { + if (conversation[i].Role == ChatRole.User) + { + firstUserIdx = i; + break; + } + } + + if (firstUserIdx >= 0) + { + return ( + conversation.Take(firstUserIdx + 1).ToList(), + conversation.Skip(firstUserIdx + 1).ToList()); + } + + return (new List(), conversation.ToList()); + } + } 
+} diff --git a/dotnet/src/Microsoft.Agents.AI/Evaluation/LocalEvaluator.cs b/dotnet/src/Microsoft.Agents.AI/Evaluation/LocalEvaluator.cs new file mode 100644 index 0000000000..2b664b0e3b --- /dev/null +++ b/dotnet/src/Microsoft.Agents.AI/Evaluation/LocalEvaluator.cs @@ -0,0 +1,66 @@ +// Copyright (c) Microsoft. All rights reserved. + +using System.Collections.Generic; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Extensions.AI.Evaluation; + +namespace Microsoft.Agents.AI; + +/// +/// Evaluator that runs check functions locally without API calls. +/// +public sealed class LocalEvaluator : IAgentEvaluator +{ + private readonly EvalCheck[] _checks; + + /// + /// Initializes a new instance of the class. + /// + /// The check functions to run on each item. + public LocalEvaluator(params EvalCheck[] checks) + { + this._checks = checks; + } + + /// + public string Name => "LocalEvaluator"; + + /// + public Task EvaluateAsync( + IReadOnlyList items, + string evalName = "Local Eval", + CancellationToken cancellationToken = default) + { + var results = new List(items.Count); + + foreach (var item in items) + { + cancellationToken.ThrowIfCancellationRequested(); + + var evalResult = new EvaluationResult(); + + foreach (var check in this._checks) + { + var EvalCheckResult = check(item); + evalResult.Metrics[EvalCheckResult.CheckName] = new BooleanMetric( + EvalCheckResult.CheckName, + EvalCheckResult.Passed, + reason: EvalCheckResult.Reason) + { + Interpretation = new EvaluationMetricInterpretation + { + Rating = EvalCheckResult.Passed + ? 
EvaluationRating.Good + : EvaluationRating.Unacceptable, + Failed = !EvalCheckResult.Passed, + }, + }; + } + + results.Add(evalResult); + } + + return Task.FromResult(new AgentEvaluationResults(this.Name, results, inputItems: items)); + } +} diff --git a/dotnet/src/Microsoft.Agents.AI/Evaluation/MeaiEvaluatorAdapter.cs b/dotnet/src/Microsoft.Agents.AI/Evaluation/MeaiEvaluatorAdapter.cs new file mode 100644 index 0000000000..e2a6ea67e4 --- /dev/null +++ b/dotnet/src/Microsoft.Agents.AI/Evaluation/MeaiEvaluatorAdapter.cs @@ -0,0 +1,63 @@ +// Copyright (c) Microsoft. All rights reserved. + +using System.Collections.Generic; +using System.Linq; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Extensions.AI; +using Microsoft.Extensions.AI.Evaluation; + +namespace Microsoft.Agents.AI; + +/// +/// Adapter that wraps an MEAI into an . +/// Runs the MEAI evaluator per-item and aggregates results. +/// +internal sealed class MeaiEvaluatorAdapter : IAgentEvaluator +{ + private readonly IEvaluator _evaluator; + private readonly ChatConfiguration _chatConfiguration; + + /// + /// Initializes a new instance of the class. + /// + /// The MEAI evaluator to wrap. + /// Chat configuration for the evaluator (includes the judge model). + public MeaiEvaluatorAdapter(IEvaluator evaluator, ChatConfiguration chatConfiguration) + { + this._evaluator = evaluator; + this._chatConfiguration = chatConfiguration; + } + + /// + public string Name => this._evaluator.GetType().Name; + + /// + public async Task EvaluateAsync( + IReadOnlyList items, + string evalName = "MEAI Eval", + CancellationToken cancellationToken = default) + { + var results = new List(items.Count); + + foreach (var item in items) + { + cancellationToken.ThrowIfCancellationRequested(); + + var (queryMessages, _) = item.Split(); + var messages = queryMessages.ToList(); + var chatResponse = item.RawResponse + ?? 
new ChatResponse(new ChatMessage(ChatRole.Assistant, item.Response)); + + var result = await this._evaluator.EvaluateAsync( + messages, + chatResponse, + this._chatConfiguration, + cancellationToken: cancellationToken).ConfigureAwait(false); + + results.Add(result); + } + + return new AgentEvaluationResults(this.Name, results); + } +} diff --git a/dotnet/src/Microsoft.Agents.AI/Microsoft.Agents.AI.csproj b/dotnet/src/Microsoft.Agents.AI/Microsoft.Agents.AI.csproj index 70da404a61..a111ce8c2d 100644 --- a/dotnet/src/Microsoft.Agents.AI/Microsoft.Agents.AI.csproj +++ b/dotnet/src/Microsoft.Agents.AI/Microsoft.Agents.AI.csproj @@ -31,6 +31,14 @@ + + + + + + + + Microsoft Agent Framework diff --git a/dotnet/tests/Microsoft.Agents.AI.UnitTests/EvaluationTests.cs b/dotnet/tests/Microsoft.Agents.AI.UnitTests/EvaluationTests.cs new file mode 100644 index 0000000000..00c3519f3f --- /dev/null +++ b/dotnet/tests/Microsoft.Agents.AI.UnitTests/EvaluationTests.cs @@ -0,0 +1,1112 @@ +// Copyright (c) Microsoft. All rights reserved. + +using System; +using System.Collections.Generic; +using System.Linq; +using System.Threading.Tasks; +using Microsoft.Extensions.AI; +using Microsoft.Extensions.AI.Evaluation; + +namespace Microsoft.Agents.AI.UnitTests; + +/// +/// Tests for the evaluation types: , , +/// , and . +/// +public sealed class EvaluationTests +{ + private static EvalItem CreateItem( + string query = "What is the weather?", + string response = "The weather in Seattle is sunny and 72°F.", + IReadOnlyList? 
conversation = null) + { + conversation ??= new List + { + new(ChatRole.User, query), + new(ChatRole.Assistant, response), + }; + + return new EvalItem(query, response, conversation); + } + + // --------------------------------------------------------------- + // EvalItem tests + // --------------------------------------------------------------- + + [Fact] + public void EvalItem_Constructor_SetsProperties() + { + // Arrange & Act + var item = CreateItem(); + + // Assert + Assert.Equal("What is the weather?", item.Query); + Assert.Equal("The weather in Seattle is sunny and 72°F.", item.Response); + Assert.Equal(2, item.Conversation.Count); + Assert.Null(item.ExpectedOutput); + Assert.Null(item.Context); + Assert.Null(item.Tools); + } + + [Fact] + public void EvalItem_OptionalProperties_CanBeSet() + { + // Arrange & Act + var item = CreateItem(); + item.ExpectedOutput = "sunny"; + item.Context = "Weather data for Seattle"; + + // Assert + Assert.Equal("sunny", item.ExpectedOutput); + Assert.Equal("Weather data for Seattle", item.Context); + } + + // --------------------------------------------------------------- + // LocalEvaluator tests + // --------------------------------------------------------------- + + [Fact] + public async Task LocalEvaluator_WithPassingCheck_ReturnsPassedResultAsync() + { + // Arrange + var evaluator = new LocalEvaluator( + FunctionEvaluator.Create("always_pass", (string _) => true)); + + var items = new List { CreateItem() }; + + // Act + var results = await evaluator.EvaluateAsync(items); + + // Assert + Assert.Equal("LocalEvaluator", results.Provider); + Assert.Equal(1, results.Total); + Assert.Equal(1, results.Passed); + Assert.Equal(0, results.Failed); + Assert.True(results.AllPassed); + } + + [Fact] + public async Task LocalEvaluator_WithFailingCheck_ReturnsFailedResultAsync() + { + // Arrange + var evaluator = new LocalEvaluator( + FunctionEvaluator.Create("always_fail", (string _) => false)); + + var items = new List { CreateItem() 
}; + + // Act + var results = await evaluator.EvaluateAsync(items); + + // Assert + Assert.Equal(1, results.Total); + Assert.Equal(0, results.Passed); + Assert.Equal(1, results.Failed); + Assert.False(results.AllPassed); + } + + [Fact] + public async Task LocalEvaluator_WithMultipleChecks_AllChecksRunAsync() + { + // Arrange + var evaluator = new LocalEvaluator( + FunctionEvaluator.Create("check1", (string _) => true), + FunctionEvaluator.Create("check2", (string _) => true)); + + var items = new List { CreateItem() }; + + // Act + var results = await evaluator.EvaluateAsync(items); + + // Assert + Assert.Equal(1, results.Total); + Assert.True(results.AllPassed); + var itemResult = results.Items[0]; + Assert.Equal(2, itemResult.Metrics.Count); + Assert.True(itemResult.Metrics.ContainsKey("check1")); + Assert.True(itemResult.Metrics.ContainsKey("check2")); + } + + [Fact] + public async Task LocalEvaluator_WithMultipleItems_EvaluatesAllAsync() + { + // Arrange + var evaluator = new LocalEvaluator( + EvalChecks.KeywordCheck("weather")); + + var items = new List + { + CreateItem(response: "The weather is sunny."), + CreateItem(response: "I don't know about that topic."), + }; + + // Act + var results = await evaluator.EvaluateAsync(items); + + // Assert + Assert.Equal(2, results.Total); + Assert.Equal(1, results.Passed); + Assert.Equal(1, results.Failed); + } + + // --------------------------------------------------------------- + // FunctionEvaluator tests + // --------------------------------------------------------------- + + [Fact] + public async Task FunctionEvaluator_ResponseOnly_PassesResponseAsync() + { + // Arrange + var check = FunctionEvaluator.Create("length_check", + (string response) => response.Length > 10); + + var evaluator = new LocalEvaluator(check); + var items = new List { CreateItem() }; + + // Act + var results = await evaluator.EvaluateAsync(items); + + // Assert + Assert.True(results.AllPassed); + } + + [Fact] + public async Task 
FunctionEvaluator_WithExpected_PassesExpectedAsync() + { + // Arrange + var check = FunctionEvaluator.Create("contains_expected", + (string response, string? expectedOutput) => + expectedOutput != null && response.Contains(expectedOutput, StringComparison.OrdinalIgnoreCase)); + + var evaluator = new LocalEvaluator(check); + var item = CreateItem(); + item.ExpectedOutput = "sunny"; + var items = new List { item }; + + // Act + var results = await evaluator.EvaluateAsync(items); + + // Assert + Assert.True(results.AllPassed); + } + + [Fact] + public async Task FunctionEvaluator_FullItem_AccessesAllFieldsAsync() + { + // Arrange + var check = FunctionEvaluator.Create("full_check", + (EvalItem item) => item.Query.Contains("weather", StringComparison.OrdinalIgnoreCase) + && item.Response.Length > 0); + + var evaluator = new LocalEvaluator(check); + var items = new List { CreateItem() }; + + // Act + var results = await evaluator.EvaluateAsync(items); + + // Assert + Assert.True(results.AllPassed); + } + + [Fact] + public async Task FunctionEvaluator_WithCheckResult_ReturnsCustomReasonAsync() + { + // Arrange + var check = FunctionEvaluator.Create("custom_check", + (EvalItem item) => new EvalCheckResult(true, "Custom reason", "custom_check")); + + var evaluator = new LocalEvaluator(check); + var items = new List { CreateItem() }; + + // Act + var results = await evaluator.EvaluateAsync(items); + + // Assert + Assert.True(results.AllPassed); + var metric = results.Items[0].Get("custom_check"); + Assert.Equal("Custom reason", metric.Reason); + } + + // --------------------------------------------------------------- + // EvalChecks tests + // --------------------------------------------------------------- + + [Fact] + public async Task KeywordCheck_AllKeywordsPresent_PassesAsync() + { + // Arrange + var evaluator = new LocalEvaluator( + EvalChecks.KeywordCheck("weather", "sunny")); + + var items = new List { CreateItem() }; + + // Act + var results = await 
evaluator.EvaluateAsync(items); + + // Assert + Assert.True(results.AllPassed); + } + + [Fact] + public async Task KeywordCheck_MissingKeyword_FailsAsync() + { + // Arrange + var evaluator = new LocalEvaluator( + EvalChecks.KeywordCheck("snow")); + + var items = new List { CreateItem() }; + + // Act + var results = await evaluator.EvaluateAsync(items); + + // Assert + Assert.False(results.AllPassed); + } + + [Fact] + public async Task KeywordCheck_CaseInsensitiveByDefault_PassesAsync() + { + // Arrange + var evaluator = new LocalEvaluator( + EvalChecks.KeywordCheck("WEATHER", "SUNNY")); + + var items = new List { CreateItem() }; + + // Act + var results = await evaluator.EvaluateAsync(items); + + // Assert + Assert.True(results.AllPassed); + } + + [Fact] + public async Task KeywordCheck_CaseSensitive_FailsOnWrongCaseAsync() + { + // Arrange + var evaluator = new LocalEvaluator( + EvalChecks.KeywordCheck(caseSensitive: true, "WEATHER")); + + var items = new List { CreateItem() }; + + // Act + var results = await evaluator.EvaluateAsync(items); + + // Assert + Assert.False(results.AllPassed); + } + + [Fact] + public async Task ToolCalledCheck_ToolPresent_PassesAsync() + { + // Arrange + var conversation = new List + { + new(ChatRole.User, "What is the weather?"), + new(ChatRole.Assistant, new List + { + new FunctionCallContent("call1", "get_weather", new Dictionary { ["city"] = "Seattle" }), + }), + new(ChatRole.Tool, new List + { + new FunctionResultContent("call1", "72°F and sunny"), + }), + new(ChatRole.Assistant, "The weather is sunny and 72°F."), + }; + + var item = CreateItem(conversation: conversation); + var evaluator = new LocalEvaluator( + EvalChecks.ToolCalledCheck("get_weather")); + + // Act + var results = await evaluator.EvaluateAsync(new List { item }); + + // Assert + Assert.True(results.AllPassed); + } + + [Fact] + public async Task ToolCalledCheck_ToolMissing_FailsAsync() + { + // Arrange + var evaluator = new LocalEvaluator( + 
EvalChecks.ToolCalledCheck("get_weather")); + + var items = new List { CreateItem() }; + + // Act + var results = await evaluator.EvaluateAsync(items); + + // Assert + Assert.False(results.AllPassed); + } + + // --------------------------------------------------------------- + // AgentEvaluationResults tests + // --------------------------------------------------------------- + + [Fact] + public void AgentEvaluationResults_AllPassed_WhenAllMetricsGood() + { + // Arrange + var evalResult = new EvaluationResult(); + evalResult.Metrics["check"] = new BooleanMetric("check", true) + { + Interpretation = new EvaluationMetricInterpretation + { + Rating = EvaluationRating.Good, + Failed = false, + }, + }; + + // Act + var results = new AgentEvaluationResults("test", new[] { evalResult }); + + // Assert + Assert.True(results.AllPassed); + Assert.Equal(1, results.Passed); + Assert.Equal(0, results.Failed); + } + + [Fact] + public void AgentEvaluationResults_NotAllPassed_WhenMetricFailed() + { + // Arrange + var evalResult = new EvaluationResult(); + evalResult.Metrics["check"] = new BooleanMetric("check", false) + { + Interpretation = new EvaluationMetricInterpretation + { + Rating = EvaluationRating.Unacceptable, + Failed = true, + }, + }; + + // Act + var results = new AgentEvaluationResults("test", new[] { evalResult }); + + // Assert + Assert.False(results.AllPassed); + Assert.Equal(0, results.Passed); + Assert.Equal(1, results.Failed); + } + + [Fact] + public void AssertAllPassed_ThrowsOnFailure() + { + // Arrange + var evalResult = new EvaluationResult(); + evalResult.Metrics["check"] = new BooleanMetric("check", false) + { + Interpretation = new EvaluationMetricInterpretation + { + Rating = EvaluationRating.Unacceptable, + Failed = true, + }, + }; + + var results = new AgentEvaluationResults("test", new[] { evalResult }); + + // Act & Assert + var ex = Assert.Throws(() => results.AssertAllPassed()); + Assert.Contains("0 passed", ex.Message); + Assert.Contains("1 
failed", ex.Message); + } + + [Fact] + public void AssertAllPassed_DoesNotThrowOnSuccess() + { + // Arrange + var evalResult = new EvaluationResult(); + evalResult.Metrics["check"] = new BooleanMetric("check", true) + { + Interpretation = new EvaluationMetricInterpretation + { + Rating = EvaluationRating.Good, + Failed = false, + }, + }; + + var results = new AgentEvaluationResults("test", new[] { evalResult }); + + // Act & Assert (no exception) + results.AssertAllPassed(); + } + + [Fact] + public void AgentEvaluationResults_NumericMetric_HighScorePasses() + { + // Arrange + var evalResult = new EvaluationResult(); + evalResult.Metrics["relevance"] = new NumericMetric("relevance", 4.5); + + // Act + var results = new AgentEvaluationResults("test", new[] { evalResult }); + + // Assert + Assert.True(results.AllPassed); + } + + [Fact] + public void AgentEvaluationResults_NumericMetric_LowScoreFails() + { + // Arrange + var evalResult = new EvaluationResult(); + evalResult.Metrics["relevance"] = new NumericMetric("relevance", 2.0); + + // Act + var results = new AgentEvaluationResults("test", new[] { evalResult }); + + // Assert + Assert.False(results.AllPassed); + } + + [Fact] + public void AgentEvaluationResults_SubResults_AllPassedChecksChildren() + { + // Arrange + var passResult = new EvaluationResult(); + passResult.Metrics["check"] = new BooleanMetric("check", true) + { + Interpretation = new EvaluationMetricInterpretation + { + Rating = EvaluationRating.Good, + Failed = false, + }, + }; + + var failResult = new EvaluationResult(); + failResult.Metrics["check"] = new BooleanMetric("check", false) + { + Interpretation = new EvaluationMetricInterpretation + { + Rating = EvaluationRating.Unacceptable, + Failed = true, + }, + }; + + var results = new AgentEvaluationResults("test", Array.Empty()) + { + SubResults = new Dictionary + { + ["agent1"] = new("test", new[] { passResult }), + ["agent2"] = new("test", new[] { failResult }), + }, + }; + + // Assert + 
Assert.False(results.AllPassed); + } + + // --------------------------------------------------------------- + // Mixed evaluator tests + // --------------------------------------------------------------- + + [Fact] + public async Task LocalEvaluator_MixedChecks_ReportsCorrectCountsAsync() + { + // Arrange + var evaluator = new LocalEvaluator( + EvalChecks.KeywordCheck("weather"), + EvalChecks.KeywordCheck("snow"), + FunctionEvaluator.Create("is_long", (string r) => r.Length > 5)); + + var items = new List { CreateItem() }; + + // Act + var results = await evaluator.EvaluateAsync(items); + + // Assert + Assert.Equal(1, results.Total); + + // One item with 3 checks: "weather" passes, "snow" fails, "is_long" passes + // The item has one failed metric so it should count as failed + Assert.Equal(0, results.Passed); + Assert.Equal(1, results.Failed); + } + + // --------------------------------------------------------------- + // Conversation Split tests + // --------------------------------------------------------------- + + private static List CreateMultiTurnConversation() + { + return new List + { + new(ChatRole.User, "What's the weather in Seattle?"), + new(ChatRole.Assistant, "Seattle is 62°F and cloudy."), + new(ChatRole.User, "And Paris?"), + new(ChatRole.Assistant, "Paris is 68°F and partly sunny."), + new(ChatRole.User, "Compare them."), + new(ChatRole.Assistant, "Seattle is cooler; Paris is warmer and sunnier."), + }; + } + + [Fact] + public void Split_LastTurn_SplitsAtLastUserMessage() + { + // Arrange + var conversation = CreateMultiTurnConversation(); + var item = new EvalItem("Compare them.", "Seattle is cooler; Paris is warmer and sunnier.", conversation); + + // Act + var (query, response) = item.Split(ConversationSplitters.LastTurn); + + // Assert — query includes everything up to and including "Compare them." 
+ Assert.Equal(5, query.Count); + Assert.Equal(ChatRole.User, query[query.Count - 1].Role); + Assert.Contains("Compare", query[query.Count - 1].Text); + + // Response is the final assistant message + Assert.Single(response); + Assert.Equal(ChatRole.Assistant, response[0].Role); + } + + [Fact] + public void Split_Full_SplitsAtFirstUserMessage() + { + // Arrange + var conversation = CreateMultiTurnConversation(); + var item = new EvalItem("What's the weather in Seattle?", "Full trajectory", conversation); + + // Act + var (query, response) = item.Split(ConversationSplitters.Full); + + // Assert — query is just the first user message + Assert.Single(query); + Assert.Contains("Seattle", query[0].Text); + + // Response is everything after + Assert.Equal(5, response.Count); + } + + [Fact] + public void Split_Full_IncludesSystemMessagesInQuery() + { + // Arrange + var conversation = new List + { + new(ChatRole.System, "You are a weather assistant."), + new(ChatRole.User, "What's the weather?"), + new(ChatRole.Assistant, "It's sunny."), + }; + + var item = new EvalItem("What's the weather?", "It's sunny.", conversation); + + // Act + var (query, response) = item.Split(ConversationSplitters.Full); + + // Assert — system message + first user message + Assert.Equal(2, query.Count); + Assert.Equal(ChatRole.System, query[0].Role); + Assert.Equal(ChatRole.User, query[1].Role); + Assert.Single(response); + } + + [Fact] + public void Split_DefaultIsLastTurn() + { + // Arrange + var conversation = CreateMultiTurnConversation(); + var item = new EvalItem("Compare them.", "response", conversation); + + // Act — no split specified + var (query, response) = item.Split(); + + // Assert — same as LastTurn + Assert.Equal(5, query.Count); + Assert.Single(response); + } + + [Fact] + public void Split_SplitterProperty_UsedWhenNoExplicitSplit() + { + // Arrange + var conversation = CreateMultiTurnConversation(); + var item = new EvalItem("query", "response", conversation) + { + Splitter = 
ConversationSplitters.Full, + }; + + // Act — no explicit split, should use Splitter + var (query, response) = item.Split(); + + // Assert — Full split + Assert.Single(query); + Assert.Equal(5, response.Count); + } + + [Fact] + public void Split_ExplicitSplitter_OverridesSplitterProperty() + { + // Arrange + var conversation = CreateMultiTurnConversation(); + var item = new EvalItem("query", "response", conversation) + { + Splitter = ConversationSplitters.Full, + }; + + // Act — explicit LastTurn overrides Full + var (query, response) = item.Split(ConversationSplitters.LastTurn); + + // Assert — LastTurn behavior + Assert.Equal(5, query.Count); + Assert.Single(response); + } + + [Fact] + public void Split_WithToolMessages_PreservesToolPairs() + { + // Arrange + var conversation = new List + { + new(ChatRole.User, "What's the weather?"), + new(ChatRole.Assistant, new List + { + new FunctionCallContent("c1", "get_weather", new Dictionary { ["city"] = "Seattle" }), + }), + new(ChatRole.Tool, new List + { + new FunctionResultContent("c1", "62°F, cloudy"), + }), + new(ChatRole.Assistant, "Seattle is 62°F and cloudy."), + new(ChatRole.User, "Thanks!"), + new(ChatRole.Assistant, "You're welcome!"), + }; + + var item = new EvalItem("Thanks!", "You're welcome!", conversation); + + // Act + var (query, response) = item.Split(ConversationSplitters.LastTurn); + + // Assert — tool messages stay in query context + Assert.Equal(5, query.Count); + Assert.Equal(ChatRole.Tool, query[2].Role); + Assert.Single(response); + } + + [Fact] + public void ConversationSplitters_LastTurn_CanBeUsedAsCustomFallback() + { + // Arrange + var conversation = CreateMultiTurnConversation(); + + // Act — use ConversationSplitters.LastTurn directly + var (query, response) = ConversationSplitters.LastTurn.Split(conversation); + + // Assert + Assert.Equal(5, query.Count); + Assert.Single(response); + } + + // --------------------------------------------------------------- + // PerTurnItems tests + // 
--------------------------------------------------------------- + + [Fact] + public void PerTurnItems_SplitsMultiTurnConversation() + { + // Arrange + var conversation = CreateMultiTurnConversation(); + + // Act + var items = EvalItem.PerTurnItems(conversation); + + // Assert — 3 user messages = 3 items + Assert.Equal(3, items.Count); + + // First turn: "What's the weather in Seattle?" + Assert.Contains("Seattle", items[0].Query); + Assert.Contains("62°F", items[0].Response); + Assert.Equal(2, items[0].Conversation.Count); + + // Second turn: "And Paris?" + Assert.Contains("Paris", items[1].Query); + Assert.Contains("68°F", items[1].Response); + Assert.Equal(4, items[1].Conversation.Count); + + // Third turn: "Compare them." + Assert.Contains("Compare", items[2].Query); + Assert.Contains("cooler", items[2].Response); + Assert.Equal(6, items[2].Conversation.Count); + } + + [Fact] + public void PerTurnItems_PropagatesToolsAndContext() + { + // Arrange + var conversation = CreateMultiTurnConversation(); + + // Act + var items = EvalItem.PerTurnItems( + conversation, + context: "Weather database"); + + // Assert + Assert.All(items, item => Assert.Equal("Weather database", item.Context)); + } + + [Fact] + public void PerTurnItems_SingleTurn_ReturnsOneItem() + { + // Arrange + var conversation = new List + { + new(ChatRole.User, "Hello"), + new(ChatRole.Assistant, "Hi there!"), + }; + + // Act + var items = EvalItem.PerTurnItems(conversation); + + // Assert + Assert.Single(items); + Assert.Equal("Hello", items[0].Query); + Assert.Equal("Hi there!", items[0].Response); + } + + // --------------------------------------------------------------- + // Custom IConversationSplitter tests + // --------------------------------------------------------------- + + [Fact] + public void Split_CustomSplitter_IsUsed() + { + // Arrange — splitter that splits before a tool call message + var conversation = new List + { + new(ChatRole.User, "Remember this"), + new(ChatRole.Assistant, 
"Storing..."), + new(ChatRole.User, "What did I say?"), + new(ChatRole.Assistant, new List + { + new FunctionCallContent("c1", "retrieve_memory"), + }), + new(ChatRole.Tool, new List + { + new FunctionResultContent("c1", "You said: Remember this"), + }), + new(ChatRole.Assistant, "You said 'Remember this'."), + }; + + var splitter = new MemorySplitter(); + var item = new EvalItem("What did I say?", "You said 'Remember this'.", conversation); + + // Act + var (query, response) = item.Split(splitter); + + // Assert — split before the tool call + Assert.Equal(3, query.Count); + Assert.Equal(3, response.Count); + } + + [Fact] + public void Split_CustomSplitter_WorksAsItemProperty() + { + // Arrange — custom splitter set on the item (simulating call-site override) + var conversation = new List + { + new(ChatRole.User, "Remember this"), + new(ChatRole.Assistant, "Storing..."), + new(ChatRole.User, "What did I say?"), + new(ChatRole.Assistant, new List + { + new FunctionCallContent("c1", "retrieve_memory"), + }), + new(ChatRole.Tool, new List + { + new FunctionResultContent("c1", "You said: Remember this"), + }), + new(ChatRole.Assistant, "You said 'Remember this'."), + }; + + var item = new EvalItem("What did I say?", "You said 'Remember this'.", conversation) + { + Splitter = new MemorySplitter(), + }; + + // Act — no explicit splitter, uses item.Splitter + var (query, response) = item.Split(); + + // Assert — custom splitter was used + Assert.Equal(3, query.Count); + Assert.Equal(3, response.Count); + } + + private sealed class MemorySplitter : IConversationSplitter + { + public (IReadOnlyList QueryMessages, IReadOnlyList ResponseMessages) Split( + IReadOnlyList conversation) + { + for (int i = 0; i < conversation.Count; i++) + { + var msg = conversation[i]; + if (msg.Role == ChatRole.Assistant && msg.Contents != null) + { + foreach (var content in msg.Contents) + { + if (content is FunctionCallContent fc && fc.Name == "retrieve_memory") + { + return ( + 
conversation.Take(i).ToList(), + conversation.Skip(i).ToList()); + } + } + } + } + + // Fallback to last-turn split + return ConversationSplitters.LastTurn.Split(conversation); + } + } + + // --------------------------------------------------------------- + // ExpectedToolCall tests + // --------------------------------------------------------------- + + [Fact] + public void ExpectedToolCall_NameOnly() + { + var tc = new ExpectedToolCall("get_weather"); + Assert.Equal("get_weather", tc.Name); + Assert.Null(tc.Arguments); + } + + [Fact] + public void ExpectedToolCall_NameAndArgs() + { + var args = new Dictionary { ["location"] = "NYC" }; + var tc = new ExpectedToolCall("get_weather", args); + Assert.Equal("get_weather", tc.Name); + Assert.NotNull(tc.Arguments); + Assert.Equal("NYC", tc.Arguments["location"]); + } + + [Fact] + public void EvalItem_ExpectedToolCalls_DefaultNull() + { + var item = CreateItem(); + Assert.Null(item.ExpectedToolCalls); + } + + [Fact] + public void EvalItem_ExpectedToolCalls_CanBeSet() + { + var item = CreateItem(); + item.ExpectedToolCalls = new List + { + new("get_weather", new Dictionary { ["location"] = "NYC" }), + new("book_flight"), + }; + + Assert.NotNull(item.ExpectedToolCalls); + Assert.Equal(2, item.ExpectedToolCalls.Count); + Assert.Equal("get_weather", item.ExpectedToolCalls[0].Name); + Assert.Null(item.ExpectedToolCalls[1].Arguments); + } + + [Fact] + public async Task LocalEvaluator_PopulatesInputItems_ForAuditingAsync() + { + // Arrange + var check = FunctionEvaluator.Create("is_sunny", + (string response) => response.Contains("sunny", StringComparison.OrdinalIgnoreCase)); + + var evaluator = new LocalEvaluator(check); + var items = new List + { + CreateItem(query: "Weather?", response: "It's sunny!"), + CreateItem(query: "Temp?", response: "72 degrees"), + }; + + // Act + var results = await evaluator.EvaluateAsync(items); + + // Assert — InputItems carries the original query/response for auditing + 
Assert.NotNull(results.InputItems); + Assert.Equal(2, results.InputItems.Count); + Assert.Equal("Weather?", results.InputItems[0].Query); + Assert.Equal("It's sunny!", results.InputItems[0].Response); + Assert.Equal("Temp?", results.InputItems[1].Query); + Assert.Equal("72 degrees", results.InputItems[1].Response); + + // Results and InputItems are positionally correlated + Assert.Equal(results.Items.Count, results.InputItems.Count); + } + + // --------------------------------------------------------------- + // AgentEvaluationResults tests + // --------------------------------------------------------------- + + [Fact] + public void AllPassed_EmptyItems_NoSubResults_ReturnsFalseAsync() + { + var results = new AgentEvaluationResults("test", Array.Empty()); + Assert.False(results.AllPassed); + Assert.Equal(0, results.Total); + } + + [Fact] + public void AllPassed_SubResultsAllPass_OverallFails_ReturnsFalseAsync() + { + // Overall has a failing item + var failMetric = new BooleanMetric("check", false) + { + Interpretation = new EvaluationMetricInterpretation + { + Rating = EvaluationRating.Unacceptable, + Failed = true, + }, + }; + var failResult = new EvaluationResult(); + failResult.Metrics["check"] = failMetric; + + var overall = new AgentEvaluationResults("test", new[] { failResult }); + + // Sub-results all pass + var passMetric = new BooleanMetric("check", true) + { + Interpretation = new EvaluationMetricInterpretation + { + Rating = EvaluationRating.Good, + Failed = false, + }, + }; + var passResult = new EvaluationResult(); + passResult.Metrics["check"] = passMetric; + + overall.SubResults = new Dictionary + { + ["agent1"] = new AgentEvaluationResults("sub", new[] { passResult }), + }; + + // Overall has a failing item, so AllPassed should be false + Assert.False(overall.AllPassed); + } + + // --------------------------------------------------------------- + // BuildItemsFromResponses validation tests + // 
--------------------------------------------------------------- + + [Fact] + public void BuildEvalItem_SetsPropertiesCorrectly() + { + var userMsg = new ChatMessage(ChatRole.User, "test query"); + var assistantMsg = new ChatMessage(ChatRole.Assistant, "response"); + var inputMessages = new List { userMsg }; + var response = new AgentResponse(assistantMsg); + + var item = AgentEvaluationExtensions.BuildEvalItem("test query", response, inputMessages, null!); + + Assert.Equal("test query", item.Query); + Assert.NotNull(item.RawResponse); + } + + [Fact] + public void BuildEvalItem_DoesNotMutateInputMessages() + { + // Arrange + var userMsg = new ChatMessage(ChatRole.User, "hello"); + var assistantMsg = new ChatMessage(ChatRole.Assistant, "world"); + var inputMessages = new List { userMsg }; + var response = new AgentResponse(assistantMsg); + + // Act + var item = AgentEvaluationExtensions.BuildEvalItem("hello", response, inputMessages, null!); + + // Assert — input list is not mutated + Assert.Single(inputMessages); + Assert.Equal(userMsg, inputMessages[0]); + + // But the EvalItem's conversation includes the response message (one input message plus one response message) + Assert.Equal(2, item.Conversation.Count); + } + + // --------------------------------------------------------------- + // BuildItemsFromResponses validation tests + // --------------------------------------------------------------- + + // Two queries against a single response: the count mismatch must throw, and the message must mention both "queries" and "responses". + [Fact] + public void BuildItemsFromResponses_MismatchedQueryAndResponseCount_Throws() + { + var queries = new[] { "q1", "q2" }; + var responses = new[] { new AgentResponse(new ChatMessage(ChatRole.Assistant, "a1")) }; + + var ex = Assert.Throws( + () => AgentEvaluationExtensions.BuildItemsFromResponses(null!, responses, queries, null, null)); + Assert.Contains("queries", ex.Message); + Assert.Contains("responses", ex.Message); + } + + // One query/response pair against two expected outputs: the call must throw and the message must mention "expectedOutput". + [Fact] + public void BuildItemsFromResponses_MismatchedExpectedOutput_Throws() + { + var queries = new[] { "q1" }; + var responses = new[] { new AgentResponse(new
ChatMessage(ChatRole.Assistant, "a1")) }; + var expectedOutput = new[] { "e1", "e2" }; + + var ex = Assert.Throws( + () => AgentEvaluationExtensions.BuildItemsFromResponses(null!, responses, queries, expectedOutput, null)); + Assert.Contains("expectedOutput", ex.Message); + } + + [Fact] + public void BuildItemsFromResponses_MismatchedExpectedToolCalls_Throws() + { + var queries = new[] { "q1" }; + var responses = new[] { new AgentResponse(new ChatMessage(ChatRole.Assistant, "a1")) }; + var expectedToolCalls = new[] { new[] { new ExpectedToolCall("t1") }, new[] { new ExpectedToolCall("t2") } }; + + var ex = Assert.Throws( + () => AgentEvaluationExtensions.BuildItemsFromResponses( + null!, responses, queries, null, expectedToolCalls)); + Assert.Contains("expectedToolCalls", ex.Message); + } + + // --------------------------------------------------------------- + // FoundryEvals.BuildEvaluators tests + // --------------------------------------------------------------- + + // Two quality evaluator names are expected to yield two evaluators (only the count is asserted here). + [Fact] + public void BuildEvaluators_QualityNames_ReturnsDistinctEvaluators() + { + var evaluators = AzureAI.FoundryEvals.BuildEvaluators( + new[] { AzureAI.FoundryEvals.Relevance, AzureAI.FoundryEvals.Coherence }); + + Assert.Equal(2, evaluators.Count); + } + + [Fact] + public void BuildEvaluators_MultipleSafetyNames_SingleContentHarmEvaluator() + { + var evaluators = AzureAI.FoundryEvals.BuildEvaluators( + new[] + { + AzureAI.FoundryEvals.Violence, + AzureAI.FoundryEvals.Sexual, + AzureAI.FoundryEvals.SelfHarm, + AzureAI.FoundryEvals.HateUnfairness, + }); + + // All four safety names produce exactly one ContentHarmEvaluator + Assert.Single(evaluators); + } + + // An unrecognized name must throw; the message must echo the name and contain "not supported" (compared case-insensitively). + [Fact] + public void BuildEvaluators_UnknownName_ThrowsArgumentException() + { + var names = new[] { "gobblygook" }; + var ex = Assert.Throws( + () => AzureAI.FoundryEvals.BuildEvaluators(names)); + Assert.Contains("gobblygook", ex.Message); + Assert.Contains("not supported", ex.Message, StringComparison.OrdinalIgnoreCase); + } + + [Fact] +
public void BuildEvaluators_DefaultSelection_ReturnsRelevanceAndCoherence() + { + // Default evaluator names when constructor receives empty array + // NOTE(review): the defaults are passed in explicitly here, so this test does not itself exercise an empty-array constructor path — confirm that path is covered elsewhere. + var defaults = new[] { AzureAI.FoundryEvals.Relevance, AzureAI.FoundryEvals.Coherence }; + var evaluators = AzureAI.FoundryEvals.BuildEvaluators(defaults); + + Assert.Equal(2, evaluators.Count); + } +} diff --git a/dotnet/tests/Microsoft.Agents.AI.UnitTests/Microsoft.Agents.AI.UnitTests.csproj b/dotnet/tests/Microsoft.Agents.AI.UnitTests/Microsoft.Agents.AI.UnitTests.csproj index ffa4417f34..8e1dba18bd 100644 --- a/dotnet/tests/Microsoft.Agents.AI.UnitTests/Microsoft.Agents.AI.UnitTests.csproj +++ b/dotnet/tests/Microsoft.Agents.AI.UnitTests/Microsoft.Agents.AI.UnitTests.csproj @@ -13,6 +13,16 @@ + + + + + + + + + + diff --git a/dotnet/tests/Microsoft.Agents.AI.Workflows.UnitTests/Microsoft.Agents.AI.Workflows.UnitTests.csproj b/dotnet/tests/Microsoft.Agents.AI.Workflows.UnitTests/Microsoft.Agents.AI.Workflows.UnitTests.csproj index 58979a4f1b..6adedab6c3 100644 --- a/dotnet/tests/Microsoft.Agents.AI.Workflows.UnitTests/Microsoft.Agents.AI.Workflows.UnitTests.csproj +++ b/dotnet/tests/Microsoft.Agents.AI.Workflows.UnitTests/Microsoft.Agents.AI.Workflows.UnitTests.csproj @@ -4,6 +4,11 @@ $(NoWarn);MEAI001 + + + + + diff --git a/dotnet/tests/Microsoft.Agents.AI.Workflows.UnitTests/WorkflowEvaluationTests.cs b/dotnet/tests/Microsoft.Agents.AI.Workflows.UnitTests/WorkflowEvaluationTests.cs new file mode 100644 index 0000000000..1ab7e71a82 --- /dev/null +++ b/dotnet/tests/Microsoft.Agents.AI.Workflows.UnitTests/WorkflowEvaluationTests.cs @@ -0,0 +1,156 @@ +// Copyright (c) Microsoft. All rights reserved. + +using System.Collections.Generic; + +namespace Microsoft.Agents.AI.Workflows.UnitTests; + +/// +/// Tests for WorkflowEvaluationExtensions.ExtractAgentData.
+/// +public sealed class WorkflowEvaluationTests +{ + [Fact] + public void ExtractAgentData_EmptyEvents_ReturnsEmpty() + { + var result = WorkflowEvaluationExtensions.ExtractAgentData(new List(), splitter: null); + + Assert.Empty(result); + } + + // One invoked/completed pair for the same executor id yields a single item carrying the query, the response, and a two-entry conversation. + [Fact] + public void ExtractAgentData_MatchedPair_ReturnsItem() + { + var events = new List + { + new ExecutorInvokedEvent("agent-1", "What is the weather?"), + new ExecutorCompletedEvent("agent-1", "It's sunny."), + }; + + var result = WorkflowEvaluationExtensions.ExtractAgentData(events, splitter: null); + + Assert.Single(result); + Assert.True(result.ContainsKey("agent-1")); + Assert.Single(result["agent-1"]); + Assert.Equal("What is the weather?", result["agent-1"][0].Query); + Assert.Equal("It's sunny.", result["agent-1"][0].Response); + Assert.Equal(2, result["agent-1"][0].Conversation.Count); + } + + [Fact] + public void ExtractAgentData_UnmatchedInvocation_NotIncluded() + { + // An invocation without a matching completion should not appear in results + var events = new List + { + new ExecutorInvokedEvent("agent-1", "Hello"), + }; + + var result = WorkflowEvaluationExtensions.ExtractAgentData(events, splitter: null); + + Assert.Empty(result); + } + + [Fact] + public void ExtractAgentData_CompletionWithoutInvocation_NotIncluded() + { + // A completion without a prior invocation should not appear in results + var events = new List + { + new ExecutorCompletedEvent("agent-1", "Response"), + }; + + var result = WorkflowEvaluationExtensions.ExtractAgentData(events, splitter: null); + + Assert.Empty(result); + } + + // Interleaved invocations and completions from two executors are expected to be paired up per executor id. + [Fact] + public void ExtractAgentData_MultipleAgents_SeparatedByExecutorId() + { + var events = new List + { + new ExecutorInvokedEvent("agent-1", "Q1"), + new ExecutorInvokedEvent("agent-2", "Q2"), + new ExecutorCompletedEvent("agent-1", "A1"), + new ExecutorCompletedEvent("agent-2", "A2"), + }; + + var result = WorkflowEvaluationExtensions.ExtractAgentData(events, splitter: null); + + Assert.Equal(2,
result.Count); + Assert.Equal("Q1", result["agent-1"][0].Query); + Assert.Equal("A1", result["agent-1"][0].Response); + Assert.Equal("Q2", result["agent-2"][0].Query); + Assert.Equal("A2", result["agent-2"][0].Response); + } + + [Fact] + public void ExtractAgentData_DuplicateExecutorId_LastInvocationUsed() + { + // If the same executor is invoked twice before completing, + // the second invocation overwrites the first (the first question yields no item of its own) + var events = new List + { + new ExecutorInvokedEvent("agent-1", "First question"), + new ExecutorInvokedEvent("agent-1", "Second question"), + new ExecutorCompletedEvent("agent-1", "Answer"), + }; + + var result = WorkflowEvaluationExtensions.ExtractAgentData(events, splitter: null); + + Assert.Single(result); + Assert.Single(result["agent-1"]); + Assert.Equal("Second question", result["agent-1"][0].Query); + } + + [Fact] + public void ExtractAgentData_MultipleRoundsForSameExecutor_AllCaptured() + { + // Same executor invoked→completed twice (sequential rounds) + var events = new List + { + new ExecutorInvokedEvent("agent-1", "Q1"), + new ExecutorCompletedEvent("agent-1", "A1"), + new ExecutorInvokedEvent("agent-1", "Q2"), + new ExecutorCompletedEvent("agent-1", "A2"), + }; + + var result = WorkflowEvaluationExtensions.ExtractAgentData(events, splitter: null); + + Assert.Single(result); // one executor + Assert.Equal(2, result["agent-1"].Count); // two items + Assert.Equal("Q1", result["agent-1"][0].Query); + Assert.Equal("Q2", result["agent-1"][1].Query); + } + + // Null invocation and completion payloads are expected to come back as empty Query/Response strings rather than null. + [Fact] + public void ExtractAgentData_NullData_UsesEmptyString() + { + var events = new List + { + new ExecutorInvokedEvent("agent-1", null!), + new ExecutorCompletedEvent("agent-1", null), + }; + + var result = WorkflowEvaluationExtensions.ExtractAgentData(events, splitter: null); + + Assert.Single(result); + Assert.Equal(string.Empty, result["agent-1"][0].Query); + Assert.Equal(string.Empty, result["agent-1"][0].Response); + } + + // The splitter handed to ExtractAgentData is expected to be attached to each produced item. + [Fact] + public void
ExtractAgentData_WithSplitter_SetOnItems() + { + var splitter = ConversationSplitters.LastTurn; + var events = new List + { + new ExecutorInvokedEvent("agent-1", "Q"), + new ExecutorCompletedEvent("agent-1", "A"), + }; + + var result = WorkflowEvaluationExtensions.ExtractAgentData(events, splitter); + + Assert.Equal(splitter, result["agent-1"][0].Splitter); + } +}