diff --git a/dotnet/agent-framework-dotnet.slnx b/dotnet/agent-framework-dotnet.slnx
index a4ffe13958..1c47bfe2b5 100644
--- a/dotnet/agent-framework-dotnet.slnx
+++ b/dotnet/agent-framework-dotnet.slnx
@@ -147,6 +147,7 @@
+
diff --git a/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step02_SelfReflection/FoundryAgents_Evaluations_Step02_SelfReflection.csproj b/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step02_SelfReflection/FoundryAgents_Evaluations_Step02_SelfReflection.csproj
index 646cd75532..8b6a7d5001 100644
--- a/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step02_SelfReflection/FoundryAgents_Evaluations_Step02_SelfReflection.csproj
+++ b/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step02_SelfReflection/FoundryAgents_Evaluations_Step02_SelfReflection.csproj
@@ -9,7 +9,6 @@
-
diff --git a/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step02_SelfReflection/Program.cs b/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step02_SelfReflection/Program.cs
index 8f8c9fa4ee..9f7ad4be3a 100644
--- a/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step02_SelfReflection/Program.cs
+++ b/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step02_SelfReflection/Program.cs
@@ -12,7 +12,6 @@
// For more details, see:
// https://learn.microsoft.com/dotnet/ai/evaluation/libraries
-using Azure.AI.OpenAI;
using Azure.AI.Projects;
using Azure.Identity;
using Microsoft.Agents.AI;
@@ -24,26 +23,25 @@
using ChatMessage = Microsoft.Extensions.AI.ChatMessage;
using ChatRole = Microsoft.Extensions.AI.ChatRole;
-string endpoint = Environment.GetEnvironmentVariable("AZURE_AI_PROJECT_ENDPOINT") ?? throw new InvalidOperationException("AZURE_AI_PROJECT_ENDPOINT is not set.");
-string deploymentName = Environment.GetEnvironmentVariable("AZURE_AI_MODEL_DEPLOYMENT_NAME") ?? "gpt-4o-mini";
-string openAiEndpoint = Environment.GetEnvironmentVariable("AZURE_OPENAI_ENDPOINT") ?? throw new InvalidOperationException("AZURE_OPENAI_ENDPOINT is not set.");
-string evaluatorDeploymentName = Environment.GetEnvironmentVariable("AZURE_OPENAI_DEPLOYMENT_NAME") ?? deploymentName;
+string endpoint = Environment.GetEnvironmentVariable("AZURE_FOUNDRY_PROJECT_ENDPOINT") ?? throw new InvalidOperationException("AZURE_FOUNDRY_PROJECT_ENDPOINT is not set.");
+string deploymentName = Environment.GetEnvironmentVariable("AZURE_FOUNDRY_PROJECT_DEPLOYMENT_NAME") ?? "gpt-4o-mini";
Console.WriteLine("=" + new string('=', 79));
Console.WriteLine("SELF-REFLECTION EVALUATION SAMPLE");
Console.WriteLine("=" + new string('=', 79));
Console.WriteLine();
-// Initialize Azure credentials and client
+// Initialize Azure credentials and client — everything derives from the project endpoint
// WARNING: DefaultAzureCredential is convenient for development but requires careful consideration in production.
// In production, consider using a specific credential (e.g., ManagedIdentityCredential) to avoid
// latency issues, unintended credential probing, and potential security risks from fallback mechanisms.
DefaultAzureCredential credential = new();
AIProjectClient aiProjectClient = new(new Uri(endpoint), credential);
-// Set up the LLM-based chat client for quality evaluators
-IChatClient chatClient = new AzureOpenAIClient(new Uri(openAiEndpoint), credential)
- .GetChatClient(evaluatorDeploymentName)
+// Get a chat client for LLM-based evaluators from the project client
+IChatClient chatClient = aiProjectClient
+ .GetProjectOpenAIClient()
+ .GetChatClient(deploymentName)
.AsIChatClient();
// Configure evaluation: quality evaluators use the LLM, safety evaluators use Azure AI Foundry
@@ -55,7 +53,8 @@
originalChatConfiguration: new ChatConfiguration(chatClient));
// Create a test agent
-AIAgent agent = await aiProjectClient.CreateAIAgentAsync(
+AIAgent? agent = null;
+agent = await aiProjectClient.CreateAIAgentAsync(
name: "KnowledgeAgent",
model: deploymentName,
instructions: "You are a helpful assistant. Answer questions accurately based on the provided context.");
@@ -93,9 +92,12 @@ 7. Enterprise-grade compliance and governance features
finally
{
// Cleanup
- await aiProjectClient.Agents.DeleteAgentAsync(agent.Name);
- Console.WriteLine();
- Console.WriteLine("Cleanup: Agent deleted.");
+ if (agent is not null)
+ {
+ await aiProjectClient.Agents.DeleteAgentAsync(agent.Name);
+ Console.WriteLine();
+ Console.WriteLine("Cleanup: Agent deleted.");
+ }
}
// ============================================================================
diff --git a/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/FoundryAgents_Evaluations_Step03_AllPatterns.csproj b/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/FoundryAgents_Evaluations_Step03_AllPatterns.csproj
new file mode 100644
index 0000000000..8b6a7d5001
--- /dev/null
+++ b/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/FoundryAgents_Evaluations_Step03_AllPatterns.csproj
@@ -0,0 +1,24 @@
+
+
+
+ Exe
+ net10.0
+
+ enable
+ enable
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/Program.cs b/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/Program.cs
new file mode 100644
index 0000000000..4f5ea0c706
--- /dev/null
+++ b/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/Program.cs
@@ -0,0 +1,339 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+// This sample demonstrates all evaluation patterns available in Agent Framework for .NET.
+// It covers:
+// 1. Function evaluators — custom checks using lambdas
+// 2. Built-in checks — keyword and tool-called validation
+// 3. MEAI evaluators — LLM-based quality scoring (Relevance, Coherence, Groundedness)
+// 4. Foundry evaluators — cloud-based evaluation with Azure AI Foundry
+// 5. Mixed evaluators — combining local checks with cloud evaluation
+// 6. Pre-existing response evaluation — evaluate responses without re-running the agent
+// 7. Conversation split strategies — LastTurn, Full, PerTurn, and call-site override
+//
+// Mirrors the Python sample: evaluate_all_patterns_sample.py
+
+using Azure.AI.Projects;
+using Azure.Identity;
+using Microsoft.Agents.AI;
+using Microsoft.Extensions.AI;
+using Microsoft.Extensions.AI.Evaluation;
+using Microsoft.Extensions.AI.Evaluation.Quality;
+using Microsoft.Extensions.AI.Evaluation.Safety;
+
+using ChatMessage = Microsoft.Extensions.AI.ChatMessage;
+using ChatRole = Microsoft.Extensions.AI.ChatRole;
+using FoundryEvals = Microsoft.Agents.AI.AzureAI.FoundryEvals;
+
+string endpoint = Environment.GetEnvironmentVariable("AZURE_FOUNDRY_PROJECT_ENDPOINT")
+ ?? throw new InvalidOperationException("AZURE_FOUNDRY_PROJECT_ENDPOINT is not set.");
+string deploymentName = Environment.GetEnvironmentVariable("AZURE_FOUNDRY_PROJECT_DEPLOYMENT_NAME") ?? "gpt-4o-mini";
+
+Console.WriteLine("=" + new string('=', 79));
+Console.WriteLine("AGENT FRAMEWORK EVALUATION — ALL PATTERNS");
+Console.WriteLine("=" + new string('=', 79));
+Console.WriteLine();
+
+// Initialize Azure credentials and the project client — the chat client, safety configuration, and agent below all derive from the project endpoint. WARNING: DefaultAzureCredential is convenient for development; in production consider a specific credential (e.g., ManagedIdentityCredential).
+DefaultAzureCredential credential = new();
+AIProjectClient aiProjectClient = new(new Uri(endpoint), credential);
+
+// Chat client obtained from the project's OpenAI client; it backs the LLM-based evaluators and uses the same deployment as the agent
+IChatClient chatClient = aiProjectClient
+ .GetProjectOpenAIClient()
+ .GetChatClient(deploymentName)
+ .AsIChatClient();
+
+ContentSafetyServiceConfiguration safetyConfig = new(
+ credential: credential,
+ endpoint: new Uri(endpoint));
+
+ChatConfiguration chatConfiguration = safetyConfig.ToChatConfiguration(
+ originalChatConfiguration: new ChatConfiguration(chatClient));
+
+// Create the test agent before entering the try block; the finally block at the end of the run deletes it
+AIAgent? agent = null;
+agent = await aiProjectClient.CreateAIAgentAsync(
+ name: "WeatherAgent",
+ model: deploymentName,
+ instructions: "You are a helpful weather assistant. Answer questions about weather accurately and concisely.");
+
+Console.WriteLine($"Created agent: {agent.Name}");
+Console.WriteLine();
+
+string[] queries = ["What's the weather in Seattle?", "Is it going to rain in New York today?"];
+
+try
+{
+    // ================================================================
+    // Section 1: Function Evaluators
+    // ================================================================
+    Console.WriteLine("SECTION 1: Function Evaluators");
+    Console.WriteLine(new string('-', 60));
+
+    // Each check is a named lambda; a lambda may take the response text or the whole EvalItem.
+    var functionEvaluator = new LocalEvaluator(
+        FunctionEvaluator.Create("is_concise",
+            (string response) => response.Split(' ').Length < 500),
+        FunctionEvaluator.Create("has_content",
+            (string response) => response.Length > 10),
+        FunctionEvaluator.Create("mentions_location",
+            (EvalItem item) => item.Response.Contains("Seattle", StringComparison.OrdinalIgnoreCase)
+                || item.Response.Contains("New York", StringComparison.OrdinalIgnoreCase)));
+
+    AgentEvaluationResults functionResults = await agent.EvaluateAsync(
+        queries,
+        functionEvaluator);
+
+    PrintResults("Function Evaluators", functionResults);
+
+    // ================================================================
+    // Section 2: Built-in Checks
+    // ================================================================
+    Console.WriteLine("SECTION 2: Built-in Checks");
+    Console.WriteLine(new string('-', 60));
+
+    var builtinEvaluator = new LocalEvaluator(
+        EvalChecks.KeywordCheck("weather"),
+        EvalChecks.KeywordCheck(caseSensitive: false, "temperature", "forecast"));
+
+    AgentEvaluationResults builtinResults = await agent.EvaluateAsync(
+        queries,
+        builtinEvaluator);
+
+    PrintResults("Built-in Checks", builtinResults);
+
+    // ================================================================
+    // Section 3: MEAI Quality Evaluators
+    // ================================================================
+    Console.WriteLine("SECTION 3: MEAI Quality Evaluators");
+    Console.WriteLine(new string('-', 60));
+
+    // Pass MEAI evaluators directly — no adapter needed
+    AgentEvaluationResults meaiResults = await agent.EvaluateAsync(
+        queries,
+        new CompositeEvaluator(
+            new RelevanceEvaluator(),
+            new CoherenceEvaluator()),
+        chatConfiguration);
+
+    PrintResults("MEAI Quality", meaiResults);
+
+    // Print per-metric details for MEAI results (only numeric metrics carry a score)
+    foreach (EvaluationResult itemResult in meaiResults.Items)
+    {
+        foreach (EvaluationMetric metric in itemResult.Metrics.Values)
+        {
+            if (metric is NumericMetric n)
+            {
+                string rating = n.Interpretation?.Rating.ToString() ?? "N/A";
+                Console.WriteLine($" {n.Name,-20} Score: {n.Value:F1}/5 Rating: {rating}");
+            }
+        }
+    }
+
+    Console.WriteLine();
+
+    // ================================================================
+    // Section 4: Foundry Evaluators (Cloud-based)
+    // ================================================================
+    Console.WriteLine("SECTION 4: Foundry Evaluators");
+    Console.WriteLine(new string('-', 60));
+
+    // NOTE(review): FoundryEvals only supplies grounding context to the Groundedness
+    // evaluator when EvalItem.Context is set — confirm agent.EvaluateAsync populates it
+    // for plain string queries, otherwise this metric may only report diagnostics.
+    var foundryEvaluator = new FoundryEvals(
+        chatConfiguration,
+        FoundryEvals.Relevance,
+        FoundryEvals.Coherence,
+        FoundryEvals.Groundedness);
+
+    AgentEvaluationResults foundryResults = await agent.EvaluateAsync(
+        queries,
+        foundryEvaluator);
+
+    PrintResults("Foundry Evaluators", foundryResults);
+
+    // ================================================================
+    // Section 5: Mixed Evaluators (Local + Cloud)
+    // ================================================================
+    Console.WriteLine("SECTION 5: Mixed Evaluators");
+    Console.WriteLine(new string('-', 60));
+
+    // One result set is returned per evaluator passed in.
+    IReadOnlyList<AgentEvaluationResults> mixedResults = await agent.EvaluateAsync(
+        queries,
+        evaluators: new IAgentEvaluator[]
+        {
+            new LocalEvaluator(
+                EvalChecks.KeywordCheck("weather"),
+                FunctionEvaluator.Create("not_empty", (string r) => r.Length > 0)),
+            new FoundryEvals(chatConfiguration, FoundryEvals.Relevance),
+        });
+
+    foreach (AgentEvaluationResults result in mixedResults)
+    {
+        PrintResults($"Mixed - {result.Provider}", result);
+    }
+
+    // ================================================================
+    // Section 6: Evaluate Pre-existing Responses
+    // ================================================================
+    Console.WriteLine("SECTION 6: Evaluate Pre-existing Responses");
+    Console.WriteLine(new string('-', 60));
+
+    // Get responses first; queries and responses are kept in parallel lists
+    var savedQueries = new List<string>();
+    var savedResponses = new List<AgentResponse>();
+    foreach (string query in queries)
+    {
+        AgentResponse response = await agent.RunAsync(
+            new List<ChatMessage> { new(ChatRole.User, query) });
+        savedQueries.Add(query);
+        savedResponses.Add(response);
+    }
+
+    // Evaluate the saved responses without re-running the agent
+    AgentEvaluationResults preExistingResults = await agent.EvaluateAsync(
+        savedResponses,
+        savedQueries,
+        new LocalEvaluator(
+            EvalChecks.KeywordCheck("weather"),
+            FunctionEvaluator.Create("response_quality",
+                (EvalItem item) => new EvalCheckResult(
+                    item.Response.Length > 20,
+                    item.Response.Length > 20
+                        ? "Response is detailed enough"
+                        : "Response is too short",
+                    "response_quality"))));
+
+    PrintResults("Pre-existing Responses", preExistingResults);
+
+    // ================================================================
+    // Section 7: Conversation Split Strategies
+    // ================================================================
+    Console.WriteLine("SECTION 7: Conversation Split Strategies");
+    Console.WriteLine(new string('-', 60));
+
+    // Build a multi-turn conversation manually
+    var multiTurnConversation = new List<ChatMessage>
+    {
+        new(ChatRole.User, "What's the weather in Seattle?"),
+        new(ChatRole.Assistant, "Seattle is 62°F, cloudy with a chance of rain."),
+        new(ChatRole.User, "And Paris?"),
+        new(ChatRole.Assistant, "Paris is 68°F, partly sunny."),
+        new(ChatRole.User, "Compare them."),
+        new(ChatRole.Assistant, "Seattle is cooler at 62°F with rain likely, while Paris is warmer at 68°F and sunnier."),
+    };
+
+    // Strategy 1: LAST_TURN (default) — evaluates the final response
+    var lastTurnItem = new EvalItem(
+        "Compare them.",
+        "Seattle is cooler at 62°F with rain likely, while Paris is warmer at 68°F and sunnier.",
+        multiTurnConversation);
+
+    var (lastQuery, lastResponse) = lastTurnItem.Split(ConversationSplitters.LastTurn);
+    Console.WriteLine($" LastTurn split: {lastQuery.Count} query msgs, {lastResponse.Count} response msgs");
+
+    // Strategy 2: FULL — evaluates the whole conversation trajectory
+    var fullItem = new EvalItem(
+        "What's the weather in Seattle?",
+        "Full conversation trajectory",
+        multiTurnConversation)
+    {
+        Splitter = ConversationSplitters.Full,
+    };
+
+    // No argument: Split() uses the splitter configured on the item.
+    var (fullQuery, fullResponse) = fullItem.Split();
+    Console.WriteLine($" Full split: {fullQuery.Count} query msgs, {fullResponse.Count} response msgs");
+
+    // Strategy 3: PER_TURN — one eval item per user turn
+    var perTurnItems = EvalItem.PerTurnItems(multiTurnConversation);
+    Console.WriteLine($" PerTurn split: {perTurnItems.Count} items from {multiTurnConversation.Count} messages");
+
+    foreach (var turnItem in perTurnItems)
+    {
+        Console.WriteLine($" Turn: \"{turnItem.Query}\" → {turnItem.Response.Length} chars");
+    }
+
+    // Evaluate per-turn items with a local evaluator
+    var splitEvaluator = new LocalEvaluator(
+        FunctionEvaluator.Create("has_response", (string r) => r.Length > 5));
+
+    AgentEvaluationResults perTurnResults = await splitEvaluator.EvaluateAsync(
+        perTurnItems.ToList());
+
+    PrintResults("Per-Turn Evaluation", perTurnResults);
+
+    // Strategy 4: Call-site override with built-in splitter
+    AgentEvaluationResults fullSplitResults = await agent.EvaluateAsync(
+        queries,
+        new LocalEvaluator(EvalChecks.KeywordCheck("weather")),
+        splitter: ConversationSplitters.Full);
+
+    PrintResults("Call-site Full Split", fullSplitResults);
+
+    // Strategy 5: Custom splitter as call-site override
+    // Same parameter works for built-in and custom splitters
+    AgentEvaluationResults customSplitResults = await agent.EvaluateAsync(
+        queries,
+        new LocalEvaluator(EvalChecks.KeywordCheck("weather")),
+        splitter: new WeatherToolSplitter());
+
+    PrintResults("Custom Splitter Override", customSplitResults);
+    Console.WriteLine();
+}
+finally
+{
+    // Cleanup: always delete the agent created for this run, even when a section above throws
+    if (agent is not null)
+    {
+        await aiProjectClient.Agents.DeleteAgentAsync(agent.Name);
+        Console.WriteLine("Cleanup: Agent deleted.");
+    }
+}
+
+// ============================================================================
+// Helper Functions
+// ============================================================================
+
+// Writes a one-line pass/fail summary for an evaluation run, followed by one
+// indented line per sub-result when the run carries a breakdown, then a blank line.
+static void PrintResults(string title, AgentEvaluationResults results)
+{
+    // Headline: overall verdict plus the passed/total tally.
+    string verdict;
+    if (results.AllPassed)
+    {
+        verdict = "✓ ALL PASSED";
+    }
+    else
+    {
+        verdict = "✗ SOME FAILED";
+    }
+
+    Console.WriteLine($" [{title}] {verdict} ({results.Passed}/{results.Total})");
+
+    // Optional breakdown: skipped entirely when SubResults is null.
+    if (results.SubResults is { } breakdown)
+    {
+        foreach (var (name, child) in breakdown)
+        {
+            string mark = child.AllPassed ? "✓" : "✗";
+            Console.WriteLine($" {mark} {name}: {child.Passed}/{child.Total}");
+        }
+    }
+
+    Console.WriteLine();
+}
+
+// ============================================================================
+// Custom Splitter — demonstrates IConversationSplitter
+// ============================================================================
+
+/// <summary>
+/// Example custom splitter that splits before the first tool call.
+/// Evaluates whether the agent's tool usage and final response are appropriate.
+/// </summary>
+internal sealed class WeatherToolSplitter : IConversationSplitter
+{
+    /// <summary>
+    /// Splits <paramref name="conversation"/> at the first assistant message that
+    /// contains a function call.
+    /// </summary>
+    /// <param name="conversation">The full conversation, in order.</param>
+    /// <returns>
+    /// The messages before the first assistant tool call as the query part and that
+    /// message plus everything after it as the response part. When no assistant message
+    /// contains a function call, the result of <see cref="ConversationSplitters.LastTurn"/>.
+    /// </returns>
+    public (IReadOnlyList<ChatMessage> QueryMessages, IReadOnlyList<ChatMessage> ResponseMessages) Split(
+        IReadOnlyList<ChatMessage> conversation)
+    {
+        for (int i = 0; i < conversation.Count; i++)
+        {
+            // Only assistant messages count: the split point is where the agent starts acting.
+            if (conversation[i].Role == ChatRole.Assistant
+                && conversation[i].Contents.OfType<FunctionCallContent>().Any())
+            {
+                return (
+                    conversation.Take(i).ToList(),
+                    conversation.Skip(i).ToList());
+            }
+        }
+
+        // Fallback: use the default LastTurn split
+        return ConversationSplitters.LastTurn.Split(conversation);
+    }
+}
diff --git a/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/README.md b/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/README.md
new file mode 100644
index 0000000000..28eab9dd36
--- /dev/null
+++ b/dotnet/samples/02-agents/FoundryAgents/FoundryAgents_Evaluations_Step03_AllPatterns/README.md
@@ -0,0 +1,49 @@
+# Evaluation — All Patterns
+
+This sample demonstrates all evaluation patterns available in Agent Framework for .NET:
+
+| Section | Pattern | Description |
+|---------|---------|-------------|
+| 1 | **Function Evaluators** | Custom checks using C# lambdas via `FunctionEvaluator.Create()` |
+| 2 | **Built-in Checks** | `EvalChecks.KeywordCheck()` and `EvalChecks.ToolCalledCheck()` |
+| 3 | **MEAI Quality Evaluators** | LLM-based scoring with `RelevanceEvaluator`, `CoherenceEvaluator` |
+| 4 | **Foundry Evaluators** | Cloud-based evaluation via `FoundryEvals` |
+| 5 | **Mixed Evaluators** | Combining local checks with cloud evaluation in one call |
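+| 6 | **Pre-existing Responses** | Evaluate saved responses without re-running the agent |
+| 7 | **Conversation Split Strategies** | `ConversationSplitters.LastTurn`, `ConversationSplitters.Full`, `EvalItem.PerTurnItems()`, and a custom `IConversationSplitter` passed as a call-site override |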
+
+## Prerequisites
+
+- Azure AI Foundry project with a deployed model
+- Set environment variables:
+ - `AZURE_FOUNDRY_PROJECT_ENDPOINT` — Your Azure AI Foundry project endpoint
+ - `AZURE_FOUNDRY_PROJECT_DEPLOYMENT_NAME` — Model deployment name (default: `gpt-4o-mini`)
+
+## Key Types
+
+```csharp
+// Custom function evaluators
+var check = FunctionEvaluator.Create("name", (string response) => response.Length > 10);
+
+// Built-in checks
+var keyword = EvalChecks.KeywordCheck("expected", "keywords");
+var toolCheck = EvalChecks.ToolCalledCheck("tool_name");
+
+// Local evaluator runs checks without API calls
+var local = new LocalEvaluator(check, keyword, toolCheck);
+
+// MEAI evaluators work directly — no adapter needed
+var results = await agent.EvaluateAsync(queries, new RelevanceEvaluator(), chatConfig);
+
+// Foundry evaluator uses Azure AI Foundry cloud evaluation
+var foundry = new FoundryEvals(chatConfig, FoundryEvals.Relevance, FoundryEvals.Coherence);
+
+// Evaluate an agent
+AgentEvaluationResults localResults = await agent.EvaluateAsync(queries, local);
+localResults.AssertAllPassed();
+```
+
+## Running
+
+```bash
+dotnet run --project FoundryAgents_Evaluations_Step03_AllPatterns.csproj
+```
diff --git a/dotnet/src/Microsoft.Agents.AI.AzureAI/Evaluation/FoundryEvals.cs b/dotnet/src/Microsoft.Agents.AI.AzureAI/Evaluation/FoundryEvals.cs
new file mode 100644
index 0000000000..a731af1099
--- /dev/null
+++ b/dotnet/src/Microsoft.Agents.AI.AzureAI/Evaluation/FoundryEvals.cs
@@ -0,0 +1,237 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+using Microsoft.Extensions.AI;
+using Microsoft.Extensions.AI.Evaluation;
+using Microsoft.Extensions.AI.Evaluation.Quality;
+using Microsoft.Extensions.AI.Evaluation.Safety;
+
+namespace Microsoft.Agents.AI.AzureAI;
+
+/// <summary>
+/// Azure AI Foundry evaluator provider with built-in evaluator name constants.
+/// </summary>
+/// <remarks>
+/// <para>
+/// Combines evaluator constants (e.g., <see cref="Relevance"/>, <see cref="Coherence"/>)
+/// with the <see cref="IAgentEvaluator"/> implementation that maps them to MEAI evaluators.
+/// </para>
+/// <para>
+/// Only <see cref="Relevance"/>, <see cref="Coherence"/>, <see cref="Groundedness"/>,
+/// <see cref="Fluency"/> and the safety constants (<see cref="Violence"/>, <see cref="Sexual"/>,
+/// <see cref="SelfHarm"/>, <see cref="HateUnfairness"/>) are currently mapped. Any other name,
+/// including the remaining constants on this class, causes <see cref="EvaluateAsync"/> to throw
+/// an <see cref="ArgumentException"/>.
+/// </para>
+/// <para>
+/// When the Azure.AI.Projects .NET SDK adds native evaluation API support, this class
+/// will be updated to use it for full parity with the Python FoundryEvals class.
+/// </para>
+/// </remarks>
+public sealed class FoundryEvals : IAgentEvaluator
+{
+    private readonly ChatConfiguration _chatConfiguration;
+    private readonly string[] _evaluatorNames;
+    private readonly IConversationSplitter? _splitter;
+
+    // -----------------------------------------------------------------------
+    // Constructors
+    // -----------------------------------------------------------------------
+
+    /// <summary>
+    /// Initializes a new instance of the <see cref="FoundryEvals"/> class.
+    /// </summary>
+    /// <param name="chatConfiguration">Chat configuration for the LLM-based evaluators.</param>
+    /// <param name="evaluators">
+    /// Names of evaluators to use (e.g., <see cref="Relevance"/>, <see cref="Coherence"/>).
+    /// When empty, defaults to relevance and coherence.
+    /// </param>
+    /// <exception cref="ArgumentNullException"><paramref name="chatConfiguration"/> is <see langword="null"/>.</exception>
+    public FoundryEvals(ChatConfiguration chatConfiguration, params string[] evaluators)
+        : this(chatConfiguration, splitter: null, evaluators)
+    {
+    }
+
+    /// <summary>
+    /// Initializes a new instance of the <see cref="FoundryEvals"/> class with a default splitter.
+    /// </summary>
+    /// <param name="chatConfiguration">Chat configuration for the LLM-based evaluators.</param>
+    /// <param name="splitter">
+    /// Default conversation splitter for multi-turn conversations. Overridden by
+    /// <see cref="EvalItem.Splitter"/> when set on individual items.
+    /// Use <see cref="ConversationSplitters.LastTurn"/>, <see cref="ConversationSplitters.Full"/>,
+    /// or a custom <see cref="IConversationSplitter"/> implementation.
+    /// </param>
+    /// <param name="evaluators">
+    /// Names of evaluators to use (e.g., <see cref="Relevance"/>, <see cref="Coherence"/>).
+    /// When empty (or <see langword="null"/>), defaults to relevance and coherence.
+    /// </param>
+    /// <exception cref="ArgumentNullException"><paramref name="chatConfiguration"/> is <see langword="null"/>.</exception>
+    public FoundryEvals(ChatConfiguration chatConfiguration, IConversationSplitter? splitter, params string[] evaluators)
+    {
+        // Fail at construction rather than deep inside the first evaluation call.
+        this._chatConfiguration = chatConfiguration ?? throw new ArgumentNullException(nameof(chatConfiguration));
+        this._splitter = splitter;
+
+        // An explicit null params array is treated like "no names given".
+        this._evaluatorNames = evaluators is { Length: > 0 }
+            ? evaluators
+            : [Relevance, Coherence];
+    }
+
+    // -----------------------------------------------------------------------
+    // IAgentEvaluator
+    // -----------------------------------------------------------------------
+
+    /// <inheritdoc />
+    public string Name => "FoundryEvals";
+
+    /// <inheritdoc />
+    /// <exception cref="ArgumentNullException"><paramref name="items"/> is <see langword="null"/>.</exception>
+    /// <exception cref="ArgumentException">A configured evaluator name is not supported.</exception>
+    public async Task<AgentEvaluationResults> EvaluateAsync(
+        IReadOnlyList<EvalItem> items,
+        string evalName = "Foundry Eval",
+        CancellationToken cancellationToken = default)
+    {
+        if (items is null)
+        {
+            throw new ArgumentNullException(nameof(items));
+        }
+
+        // All configured evaluators run together against each item.
+        var meaiEvaluators = BuildEvaluators(this._evaluatorNames);
+        var composite = new CompositeEvaluator(meaiEvaluators.ToArray());
+
+        var results = new List<EvaluationResult>(items.Count);
+
+        foreach (var item in items)
+        {
+            cancellationToken.ThrowIfCancellationRequested();
+
+            // Resolve splitter: item-level > evaluator-level > LastTurn default
+            // NOTE(review): only the query half of the split is used; the response always
+            // comes from RawResponse/Response below — confirm that is intended for
+            // Full and custom splitters.
+            var effectiveSplitter = item.Splitter ?? this._splitter;
+            var (queryMessages, _) = item.Split(effectiveSplitter);
+            var messages = queryMessages.ToList();
+
+            // Prefer the original response object; otherwise wrap the response text.
+            var chatResponse = item.RawResponse
+                ?? new ChatResponse(new ChatMessage(ChatRole.Assistant, item.Response));
+
+            var additionalContext = new List<EvaluationContext>();
+
+            // Grounding context is only available when the item carries it.
+            if (item.Context is not null)
+            {
+                additionalContext.Add(new GroundednessEvaluatorContext(item.Context));
+            }
+
+            var result = await composite.EvaluateAsync(
+                messages,
+                chatResponse,
+                this._chatConfiguration,
+                additionalContext: additionalContext.Count > 0 ? additionalContext : null,
+                cancellationToken: cancellationToken).ConfigureAwait(false);
+
+            results.Add(result);
+        }
+
+        return new AgentEvaluationResults(this.Name, results);
+    }
+
+    // -----------------------------------------------------------------------
+    // Evaluator name constants
+    // -----------------------------------------------------------------------
+
+    // Agent behavior
+
+    /// <summary>Evaluates whether the agent correctly resolves user intent.</summary>
+    public const string IntentResolution = "intent_resolution";
+
+    /// <summary>Evaluates whether the agent adheres to its task instructions.</summary>
+    public const string TaskAdherence = "task_adherence";
+
+    /// <summary>Evaluates whether the agent completes the requested task.</summary>
+    public const string TaskCompletion = "task_completion";
+
+    /// <summary>Evaluates the efficiency of the agent's navigation to complete the task.</summary>
+    public const string TaskNavigationEfficiency = "task_navigation_efficiency";
+
+    // Tool usage
+
+    /// <summary>Evaluates the accuracy of tool calls made by the agent.</summary>
+    public const string ToolCallAccuracy = "tool_call_accuracy";
+
+    /// <summary>Evaluates whether the agent selects the correct tools.</summary>
+    public const string ToolSelection = "tool_selection";
+
+    /// <summary>Evaluates the accuracy of inputs provided to tools.</summary>
+    public const string ToolInputAccuracy = "tool_input_accuracy";
+
+    /// <summary>Evaluates how well the agent uses tool outputs.</summary>
+    public const string ToolOutputUtilization = "tool_output_utilization";
+
+    /// <summary>Evaluates whether tool calls succeed.</summary>
+    public const string ToolCallSuccess = "tool_call_success";
+
+    // Quality
+
+    /// <summary>Evaluates the coherence of the response.</summary>
+    public const string Coherence = "coherence";
+
+    /// <summary>Evaluates the fluency of the response.</summary>
+    public const string Fluency = "fluency";
+
+    /// <summary>Evaluates the relevance of the response to the query.</summary>
+    public const string Relevance = "relevance";
+
+    /// <summary>Evaluates whether the response is grounded in the provided context.</summary>
+    public const string Groundedness = "groundedness";
+
+    /// <summary>Evaluates the completeness of the response.</summary>
+    public const string ResponseCompleteness = "response_completeness";
+
+    /// <summary>Evaluates the similarity between the response and the expected output.</summary>
+    public const string Similarity = "similarity";
+
+    // Safety
+
+    /// <summary>Evaluates the response for violent content.</summary>
+    public const string Violence = "violence";
+
+    /// <summary>Evaluates the response for sexual content.</summary>
+    public const string Sexual = "sexual";
+
+    /// <summary>Evaluates the response for self-harm content.</summary>
+    public const string SelfHarm = "self_harm";
+
+    /// <summary>Evaluates the response for hate or unfairness.</summary>
+    public const string HateUnfairness = "hate_unfairness";
+
+    // -----------------------------------------------------------------------
+    // Internal helpers
+    // -----------------------------------------------------------------------
+
+    /// <summary>
+    /// Maps evaluator names to MEAI evaluator instances, in the order given.
+    /// </summary>
+    /// <param name="names">Evaluator names, typically the constants declared on this class.</param>
+    /// <returns>
+    /// One evaluator per quality name, plus at most one <see cref="ContentHarmEvaluator"/>
+    /// regardless of how many safety names were requested.
+    /// </returns>
+    /// <exception cref="ArgumentException">A name is not one of the supported evaluators.</exception>
+    internal static List<IEvaluator> BuildEvaluators(string[] names)
+    {
+        var evaluators = new List<IEvaluator>();
+        bool hasSafetyEvaluator = false;
+
+        foreach (var name in names)
+        {
+            IEvaluator? evaluator = name switch
+            {
+                Relevance => new RelevanceEvaluator(),
+                Coherence => new CoherenceEvaluator(),
+                Groundedness => new GroundednessEvaluator(),
+                Fluency => new FluencyEvaluator(),
+
+                // ContentHarmEvaluator covers all harm categories in one call — deduplicate
+                Violence or
+                Sexual or
+                SelfHarm or
+                HateUnfairness when !hasSafetyEvaluator => new ContentHarmEvaluator(),
+
+                // A safety evaluator was already added for an earlier name; nothing more to add.
+                Violence or
+                Sexual or
+                SelfHarm or
+                HateUnfairness => null,
+
+                _ => throw new ArgumentException(
+                    $"Evaluator '{name}' is not supported by the .NET FoundryEvals adapter. " +
+                    $"Supported: {Relevance}, {Coherence}, {Groundedness}, {Fluency}, " +
+                    $"{Violence}, {Sexual}, {SelfHarm}, {HateUnfairness}.",
+                    nameof(names)),
+            };
+
+            if (evaluator is ContentHarmEvaluator)
+            {
+                hasSafetyEvaluator = true;
+            }
+
+            if (evaluator is not null)
+            {
+                evaluators.Add(evaluator);
+            }
+        }
+
+        return evaluators;
+    }
+}
diff --git a/dotnet/src/Microsoft.Agents.AI.AzureAI/Microsoft.Agents.AI.AzureAI.csproj b/dotnet/src/Microsoft.Agents.AI.AzureAI/Microsoft.Agents.AI.AzureAI.csproj
index 0cd8690126..fce34b7201 100644
--- a/dotnet/src/Microsoft.Agents.AI.AzureAI/Microsoft.Agents.AI.AzureAI.csproj
+++ b/dotnet/src/Microsoft.Agents.AI.AzureAI/Microsoft.Agents.AI.AzureAI.csproj
@@ -20,6 +20,20 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/dotnet/src/Microsoft.Agents.AI.Workflows/Evaluation/WorkflowEvaluationExtensions.cs b/dotnet/src/Microsoft.Agents.AI.Workflows/Evaluation/WorkflowEvaluationExtensions.cs
new file mode 100644
index 0000000000..badf6ff642
--- /dev/null
+++ b/dotnet/src/Microsoft.Agents.AI.Workflows/Evaluation/WorkflowEvaluationExtensions.cs
@@ -0,0 +1,135 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Threading;
+using System.Threading.Tasks;
+using Microsoft.Extensions.AI;
+using Microsoft.Extensions.AI.Evaluation;
+
+namespace Microsoft.Agents.AI.Workflows;
+
+/// <summary>
+/// Extension methods for evaluating workflow runs.
+/// </summary>
+public static class WorkflowEvaluationExtensions
+{
+    /// <summary>
+    /// Evaluates a completed workflow run.
+    /// </summary>
+    /// <remarks>
+    /// The overall item pairs the data of the first executor-invoked event (as the query)
+    /// with the text of the last agent response event. When no such response exists, or
+    /// <paramref name="includeOverall"/> is <see langword="false"/>, the returned result has no items
+    /// of its own and only carries the per-agent sub-results (if requested).
+    /// </remarks>
+    /// <param name="run">The completed workflow run.</param>
+    /// <param name="evaluator">The evaluator to score results.</param>
+    /// <param name="includeOverall">Whether to include an overall evaluation.</param>
+    /// <param name="includePerAgent">Whether to include per-agent breakdowns.</param>
+    /// <param name="evalName">Display name for this evaluation run.</param>
+    /// <param name="splitter">
+    /// Optional conversation splitter to apply to all items.
+    /// Use <see cref="ConversationSplitters.LastTurn"/>, <see cref="ConversationSplitters.Full"/>,
+    /// or a custom <see cref="IConversationSplitter"/> implementation.
+    /// </param>
+    /// <param name="cancellationToken">Cancellation token.</param>
+    /// <returns>Evaluation results with optional per-agent sub-results.</returns>
+    /// <exception cref="ArgumentNullException">
+    /// <paramref name="run"/> or <paramref name="evaluator"/> is <see langword="null"/>.
+    /// </exception>
+    public static async Task<AgentEvaluationResults> EvaluateAsync(
+        this Run run,
+        IAgentEvaluator evaluator,
+        bool includeOverall = true,
+        bool includePerAgent = true,
+        string evalName = "Workflow Eval",
+        IConversationSplitter? splitter = null,
+        CancellationToken cancellationToken = default)
+    {
+        if (run is null)
+        {
+            throw new ArgumentNullException(nameof(run));
+        }
+
+        if (evaluator is null)
+        {
+            throw new ArgumentNullException(nameof(evaluator));
+        }
+
+        // Snapshot the events once; they are scanned several times below.
+        var events = run.OutgoingEvents.ToList();
+
+        // Build overall items from final output
+        var overallItems = new List<EvalItem>();
+        if (includeOverall)
+        {
+            // NOTE(review): event type restored as AgentResponseEvent — confirm against the
+            // Workflows event model.
+            var finalResponse = events.OfType<AgentResponseEvent>().LastOrDefault();
+            if (finalResponse is not null)
+            {
+                var firstInvoked = events.OfType<ExecutorInvokedEvent>().FirstOrDefault();
+                var query = firstInvoked?.Data?.ToString() ?? string.Empty;
+                var responseText = finalResponse.Response.Text;
+                var conversation = new List<ChatMessage>
+                {
+                    new(ChatRole.User, query),
+                    new(ChatRole.Assistant, responseText),
+                };
+
+                overallItems.Add(new EvalItem(query, responseText, conversation)
+                {
+                    Splitter = splitter,
+                });
+            }
+        }
+
+        // Evaluate overall; fall back to an empty result set so sub-results have a parent.
+        var overallResult = overallItems.Count > 0
+            ? await evaluator.EvaluateAsync(overallItems, evalName, cancellationToken).ConfigureAwait(false)
+            : new AgentEvaluationResults(evaluator.Name, Array.Empty<EvaluationResult>());
+
+        // Per-agent breakdown (event pairing is skipped entirely when not requested)
+        if (includePerAgent)
+        {
+            var agentData = ExtractAgentData(events, splitter);
+            if (agentData.Count > 0)
+            {
+                var subResults = new Dictionary<string, AgentEvaluationResults>();
+
+                foreach (var kvp in agentData)
+                {
+                    subResults[kvp.Key] = await evaluator.EvaluateAsync(
+                        kvp.Value,
+                        $"{evalName} - {kvp.Key}",
+                        cancellationToken).ConfigureAwait(false);
+                }
+
+                overallResult.SubResults = subResults;
+            }
+        }
+
+        return overallResult;
+    }
+
+    /// <summary>
+    /// Pairs executor invoked/completed events into evaluation items, grouped by executor id.
+    /// </summary>
+    /// <remarks>
+    /// Each completed event is matched with the most recent unmatched invoked event for the
+    /// same executor id. Completed events without a pending invocation are ignored, and
+    /// invocations that never complete produce no item.
+    /// </remarks>
+    /// <param name="events">The run's events, in emission order.</param>
+    /// <param name="splitter">Splitter stamped on every produced item; may be <see langword="null"/>.</param>
+    /// <returns>A map from executor id to the items produced for that executor.</returns>
+    internal static Dictionary<string, List<EvalItem>> ExtractAgentData(
+        List<WorkflowEvent> events,
+        IConversationSplitter? splitter)
+    {
+        // Pending invocations, keyed by executor id, awaiting their completion event.
+        var invoked = new Dictionary<string, ExecutorInvokedEvent>();
+        var agentData = new Dictionary<string, List<EvalItem>>();
+
+        foreach (var evt in events)
+        {
+            if (evt is ExecutorInvokedEvent invokedEvent)
+            {
+                // A repeated invocation before completion replaces the pending one.
+                invoked[invokedEvent.ExecutorId] = invokedEvent;
+            }
+            else if (evt is ExecutorCompletedEvent completedEvent
+                && invoked.TryGetValue(completedEvent.ExecutorId, out var matchingInvoked))
+            {
+                var query = matchingInvoked.Data?.ToString() ?? string.Empty;
+                var responseText = completedEvent.Data?.ToString() ?? string.Empty;
+                var conversation = new List<ChatMessage>
+                {
+                    new(ChatRole.User, query),
+                    new(ChatRole.Assistant, responseText),
+                };
+
+                var item = new EvalItem(query, responseText, conversation)
+                {
+                    Splitter = splitter,
+                };
+
+                if (!agentData.TryGetValue(completedEvent.ExecutorId, out var items))
+                {
+                    items = new List<EvalItem>();
+                    agentData[completedEvent.ExecutorId] = items;
+                }
+
+                items.Add(item);
+
+                // Consume the invocation so a later completion cannot reuse it.
+                invoked.Remove(completedEvent.ExecutorId);
+            }
+        }
+
+        return agentData;
+    }
+}
diff --git a/dotnet/src/Microsoft.Agents.AI.Workflows/Microsoft.Agents.AI.Workflows.csproj b/dotnet/src/Microsoft.Agents.AI.Workflows/Microsoft.Agents.AI.Workflows.csproj
index c103ead32d..0e4e20e47b 100644
--- a/dotnet/src/Microsoft.Agents.AI.Workflows/Microsoft.Agents.AI.Workflows.csproj
+++ b/dotnet/src/Microsoft.Agents.AI.Workflows/Microsoft.Agents.AI.Workflows.csproj
@@ -54,4 +54,9 @@
+
+
+
+
+
diff --git a/dotnet/src/Microsoft.Agents.AI/Evaluation/AgentEvaluationExtensions.cs b/dotnet/src/Microsoft.Agents.AI/Evaluation/AgentEvaluationExtensions.cs
new file mode 100644
index 0000000000..31904218ad
--- /dev/null
+++ b/dotnet/src/Microsoft.Agents.AI/Evaluation/AgentEvaluationExtensions.cs
@@ -0,0 +1,355 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Threading;
+using System.Threading.Tasks;
+using Microsoft.Extensions.AI;
+using Microsoft.Extensions.AI.Evaluation;
+
+namespace Microsoft.Agents.AI;
+
+/// <summary>
+/// Extension methods for evaluating agents, responses, and workflow runs.
+/// </summary>
+public static partial class AgentEvaluationExtensions
+{
+ /// <summary>
+ /// Evaluates an agent by running it against test queries and scoring the responses.
+ /// </summary>
+ /// <param name="agent">The agent to evaluate.</param>
+ /// <param name="queries">Test queries to send to the agent.</param>
+ /// <param name="evaluator">The evaluator to score responses.</param>
+ /// <param name="evalName">Display name for this evaluation run.</param>
+ /// <param name="expectedOutput">
+ /// Optional ground-truth expected outputs, one per query. When provided,
+ /// must be the same length as <paramref name="queries"/>. Each value is
+ /// stamped on the corresponding <see cref="EvalItem.ExpectedOutput"/>.
+ /// </param>
+ /// <param name="expectedToolCalls">
+ /// Optional expected tool calls, one list per query. When provided,
+ /// must be the same length as <paramref name="queries"/>. Each list is
+ /// stamped on the corresponding <see cref="EvalItem.ExpectedToolCalls"/>.
+ /// </param>
+ /// <param name="splitter">
+ /// Optional conversation splitter to apply to all items.
+ /// Use <see cref="ConversationSplitters.LastTurn"/>, <see cref="ConversationSplitters.Full"/>,
+ /// or a custom <see cref="IConversationSplitter"/> implementation.
+ /// </param>
+ /// <param name="numRepetitions">
+ /// Number of times to run each query (default 1). When greater than 1, each query is invoked
+ /// independently N times to measure consistency. Results contain all <c>N × queries.Count</c> items.
+ /// </param>
+ /// <param name="cancellationToken">Cancellation token.</param>
+ /// <returns>Evaluation results.</returns>
+ public static async Task EvaluateAsync(
+ this AIAgent agent,
+ IEnumerable queries,
+ IAgentEvaluator evaluator,
+ string evalName = "Agent Framework Eval",
+ IEnumerable? expectedOutput = null,
+ IEnumerable>? expectedToolCalls = null,
+ IConversationSplitter? splitter = null,
+ int numRepetitions = 1,
+ CancellationToken cancellationToken = default)
+ {
+ var items = await RunAgentForEvalAsync(agent, queries, expectedOutput, expectedToolCalls, splitter, numRepetitions, cancellationToken).ConfigureAwait(false); // Run the agent first; scoring happens on the collected items.
+ return await evaluator.EvaluateAsync(items, evalName, cancellationToken).ConfigureAwait(false);
+ }
+
+ /// <summary>
+ /// Evaluates an agent using an MEAI evaluator directly.
+ /// </summary>
+ /// <param name="agent">The agent to evaluate.</param>
+ /// <param name="queries">Test queries to send to the agent.</param>
+ /// <param name="evaluator">The MEAI evaluator (e.g., RelevanceEvaluator, CompositeEvaluator).</param>
+ /// <param name="chatConfiguration">Chat configuration for the MEAI evaluator (includes the judge model).</param>
+ /// <param name="evalName">Display name for this evaluation run.</param>
+ /// <param name="expectedOutput">
+ /// Optional ground-truth expected outputs, one per query.
+ /// </param>
+ /// <param name="expectedToolCalls">
+ /// Optional expected tool calls, one list per query.
+ /// </param>
+ /// <param name="splitter">
+ /// Optional conversation splitter to apply to all items.
+ /// Use <see cref="ConversationSplitters.LastTurn"/>, <see cref="ConversationSplitters.Full"/>,
+ /// or a custom <see cref="IConversationSplitter"/> implementation.
+ /// </param>
+ /// <param name="numRepetitions">
+ /// Number of times to run each query (default 1). When greater than 1, each query is invoked
+ /// independently N times to measure consistency.
+ /// </param>
+ /// <param name="cancellationToken">Cancellation token.</param>
+ /// <returns>Evaluation results.</returns>
+ public static async Task EvaluateAsync(
+ this AIAgent agent,
+ IEnumerable queries,
+ IEvaluator evaluator,
+ ChatConfiguration chatConfiguration,
+ string evalName = "Agent Framework Eval",
+ IEnumerable? expectedOutput = null,
+ IEnumerable>? expectedToolCalls = null,
+ IConversationSplitter? splitter = null,
+ int numRepetitions = 1,
+ CancellationToken cancellationToken = default)
+ {
+ var wrapped = new MeaiEvaluatorAdapter(evaluator, chatConfiguration); // Adapt the per-item MEAI evaluator to the batch IAgentEvaluator shape.
+ return await agent.EvaluateAsync(queries, wrapped, evalName, expectedOutput, expectedToolCalls, splitter, numRepetitions, cancellationToken).ConfigureAwait(false);
+ }
+
+ /// <summary>
+ /// Evaluates an agent by running it against test queries with multiple evaluators.
+ /// </summary>
+ /// <param name="agent">The agent to evaluate.</param>
+ /// <param name="queries">Test queries to send to the agent.</param>
+ /// <param name="evaluators">The evaluators to score responses.</param>
+ /// <param name="evalName">Display name for this evaluation run.</param>
+ /// <param name="expectedOutput">
+ /// Optional ground-truth expected outputs, one per query.
+ /// </param>
+ /// <param name="expectedToolCalls">
+ /// Optional expected tool calls, one list per query.
+ /// </param>
+ /// <param name="splitter">
+ /// Optional conversation splitter to apply to all items.
+ /// Use <see cref="ConversationSplitters.LastTurn"/>, <see cref="ConversationSplitters.Full"/>,
+ /// or a custom <see cref="IConversationSplitter"/> implementation.
+ /// </param>
+ /// <param name="numRepetitions">
+ /// Number of times to run each query (default 1). When greater than 1, each query is invoked
+ /// independently N times to measure consistency.
+ /// </param>
+ /// <param name="cancellationToken">Cancellation token.</param>
+ /// <returns>One result per evaluator.</returns>
+ public static async Task> EvaluateAsync(
+ this AIAgent agent,
+ IEnumerable queries,
+ IEnumerable evaluators,
+ string evalName = "Agent Framework Eval",
+ IEnumerable? expectedOutput = null,
+ IEnumerable>? expectedToolCalls = null,
+ IConversationSplitter? splitter = null,
+ int numRepetitions = 1,
+ CancellationToken cancellationToken = default)
+ {
+ var items = await RunAgentForEvalAsync(agent, queries, expectedOutput, expectedToolCalls, splitter, numRepetitions, cancellationToken).ConfigureAwait(false); // The agent runs once; every evaluator scores the same items.
+
+ var results = new List();
+ foreach (var evaluator in evaluators) // Evaluators run one after another, in enumeration order.
+ {
+ var result = await evaluator.EvaluateAsync(items, evalName, cancellationToken).ConfigureAwait(false);
+ results.Add(result);
+ }
+
+ return results;
+ }
+
+ /// <summary>
+ /// Evaluates pre-existing agent responses without re-running the agent.
+ /// </summary>
+ /// <param name="agent">The agent (used for tool definitions). NOTE(review): the item-building helpers in this file do not read it — confirm.</param>
+ /// <param name="responses">Pre-existing agent responses.</param>
+ /// <param name="queries">The queries that produced each response (must match count).</param>
+ /// <param name="evaluator">The evaluator to score responses.</param>
+ /// <param name="evalName">Display name for this evaluation run.</param>
+ /// <param name="expectedOutput">
+ /// Optional ground-truth expected outputs, one per query.
+ /// </param>
+ /// <param name="expectedToolCalls">
+ /// Optional expected tool calls, one list per query.
+ /// </param>
+ /// <param name="cancellationToken">Cancellation token.</param>
+ /// <returns>Evaluation results.</returns>
+ public static async Task EvaluateAsync(
+ this AIAgent agent,
+ IEnumerable responses,
+ IEnumerable queries,
+ IAgentEvaluator evaluator,
+ string evalName = "Agent Framework Eval",
+ IEnumerable? expectedOutput = null,
+ IEnumerable>? expectedToolCalls = null,
+ CancellationToken cancellationToken = default)
+ {
+ var items = BuildItemsFromResponses(agent, responses, queries, expectedOutput, expectedToolCalls); // Throws ArgumentException when the sequence counts differ.
+ return await evaluator.EvaluateAsync(items, evalName, cancellationToken).ConfigureAwait(false);
+ }
+
+ /// <summary>
+ /// Evaluates pre-existing agent responses using an MEAI evaluator directly.
+ /// </summary>
+ /// <param name="agent">The agent (used for tool definitions). NOTE(review): the item-building helpers in this file do not read it — confirm.</param>
+ /// <param name="responses">Pre-existing agent responses.</param>
+ /// <param name="queries">The queries that produced each response (must match count).</param>
+ /// <param name="evaluator">The MEAI evaluator.</param>
+ /// <param name="chatConfiguration">Chat configuration for the MEAI evaluator.</param>
+ /// <param name="evalName">Display name for this evaluation run.</param>
+ /// <param name="expectedOutput">
+ /// Optional ground-truth expected outputs, one per query.
+ /// </param>
+ /// <param name="expectedToolCalls">
+ /// Optional expected tool calls, one list per query.
+ /// </param>
+ /// <param name="cancellationToken">Cancellation token.</param>
+ /// <returns>Evaluation results.</returns>
+ public static async Task EvaluateAsync(
+ this AIAgent agent,
+ IEnumerable responses,
+ IEnumerable queries,
+ IEvaluator evaluator,
+ ChatConfiguration chatConfiguration,
+ string evalName = "Agent Framework Eval",
+ IEnumerable? expectedOutput = null,
+ IEnumerable>? expectedToolCalls = null,
+ CancellationToken cancellationToken = default)
+ {
+ var wrapped = new MeaiEvaluatorAdapter(evaluator, chatConfiguration); // Adapt the per-item MEAI evaluator to the batch IAgentEvaluator shape.
+ return await agent.EvaluateAsync(responses, queries, wrapped, evalName, expectedOutput, expectedToolCalls, cancellationToken).ConfigureAwait(false);
+ }
+
+ internal static List BuildItemsFromResponses( // Builds one eval item per (query, response) pair without invoking the agent.
+ AIAgent agent,
+ IEnumerable responses,
+ IEnumerable queries,
+ IEnumerable? expectedOutput,
+ IEnumerable>? expectedToolCalls)
+ {
+ var responseList = responses.ToList(); // Materialize each sequence once so counts and indexing are stable.
+ var queryList = queries.ToList();
+ var expectedList = expectedOutput?.ToList();
+ var expectedToolCallsList = expectedToolCalls?.ToList();
+
+ if (responseList.Count != queryList.Count) // Responses pair positionally with queries, so the counts must agree.
+ {
+ throw new ArgumentException(
+ $"Got {queryList.Count} queries but {responseList.Count} responses. Counts must match.");
+ }
+
+ if (expectedList != null && expectedList.Count != queryList.Count) // Optional inputs are validated only when supplied.
+ {
+ throw new ArgumentException(
+ $"Got {queryList.Count} queries but {expectedList.Count} expectedOutput values. Counts must match.");
+ }
+
+ if (expectedToolCallsList != null && expectedToolCallsList.Count != queryList.Count)
+ {
+ throw new ArgumentException(
+ $"Got {queryList.Count} queries but {expectedToolCallsList.Count} expectedToolCalls lists. Counts must match.");
+ }
+
+ var items = new List();
+ for (int i = 0; i < responseList.Count; i++)
+ {
+ var query = queryList[i];
+ var response = responseList[i];
+
+ var messages = new List // Conversation starts with the query as a user message...
+ {
+ new(ChatRole.User, query),
+ };
+ messages.AddRange(response.Messages); // ...followed by the recorded response messages.
+
+ var item = BuildEvalItem(query, response, messages, agent);
+ if (expectedList != null)
+ {
+ item.ExpectedOutput = expectedList[i];
+ }
+
+ if (expectedToolCallsList != null)
+ {
+ item.ExpectedToolCalls = expectedToolCallsList[i].ToList(); // Snapshot the caller's sequence into a list.
+ }
+
+ items.Add(item);
+ }
+
+ return items;
+ }
+
+ private static async Task> RunAgentForEvalAsync( // Runs the agent on every query, numRepetitions times over, and builds one eval item per run.
+ AIAgent agent,
+ IEnumerable queries,
+ IEnumerable? expectedOutput,
+ IEnumerable>? expectedToolCalls,
+ IConversationSplitter? splitter,
+ int numRepetitions,
+ CancellationToken cancellationToken)
+ {
+ if (numRepetitions < 1) // Zero or negative repetitions are rejected rather than returning an empty list.
+ {
+ throw new ArgumentException($"numRepetitions must be >= 1, got {numRepetitions}.", nameof(numRepetitions));
+ }
+
+ var items = new List();
+ var queryList = queries.ToList(); // Materialize each sequence once so counts and indexing are stable.
+ var expectedList = expectedOutput?.ToList();
+ var expectedToolCallsList = expectedToolCalls?.ToList();
+
+ if (expectedList != null && expectedList.Count != queryList.Count) // Optional inputs are validated only when supplied.
+ {
+ throw new ArgumentException(
+ $"Got {queryList.Count} queries but {expectedList.Count} expectedOutput values. Counts must match.");
+ }
+
+ if (expectedToolCallsList != null && expectedToolCallsList.Count != queryList.Count)
+ {
+ throw new ArgumentException(
+ $"Got {queryList.Count} queries but {expectedToolCallsList.Count} expectedToolCalls lists. Counts must match.");
+ }
+
+ for (int rep = 0; rep < numRepetitions; rep++) // Outer loop repeats the whole query set; items are ordered by repetition, then by query.
+ {
+ for (int i = 0; i < queryList.Count; i++)
+ {
+ cancellationToken.ThrowIfCancellationRequested(); // Checked before each agent invocation.
+
+ var query = queryList[i];
+ var messages = new List // Every run starts from a fresh single-message conversation.
+ {
+ new(ChatRole.User, query),
+ };
+
+ var response = await agent.RunAsync(messages, cancellationToken: cancellationToken).ConfigureAwait(false);
+ var item = BuildEvalItem(query, response, messages, agent);
+ item.Splitter = splitter;
+ if (expectedList != null)
+ {
+ item.ExpectedOutput = expectedList[i]; // Indexed by query, so every repetition gets the same expectation.
+ }
+
+ if (expectedToolCallsList != null)
+ {
+ item.ExpectedToolCalls = expectedToolCallsList[i].ToList(); // Snapshot the caller's sequence into a list.
+ }
+
+ items.Add(item);
+ }
+ }
+
+ return items;
+ }
+
+ internal static EvalItem BuildEvalItem( // Combines a query, the agent response and the conversation so far into one eval item.
+ string query,
+ AgentResponse response,
+ List messages,
+ AIAgent agent) // NOTE(review): not read anywhere in this method — confirm whether tool definitions were meant to come from it.
+ {
+ // Copy the supplied messages, then append response messages the copy does not already contain (List.Contains, default equality).
+ var conversation = new List(messages);
+ foreach (var msg in response.Messages)
+ {
+ if (!conversation.Contains(msg)) // Callers that already appended the response messages get no duplicates.
+ {
+ conversation.Add(msg);
+ }
+ }
+
+ return new EvalItem(query, response.Text, conversation)
+ {
+ RawResponse = new ChatResponse(response.Messages.LastOrDefault() // Only the last response message is wrapped...
+ ?? new ChatMessage(ChatRole.Assistant, response.Text)), // ...or a synthetic assistant message when there are none.
+ };
+ }
+}
diff --git a/dotnet/src/Microsoft.Agents.AI/Evaluation/AgentEvaluationResults.cs b/dotnet/src/Microsoft.Agents.AI/Evaluation/AgentEvaluationResults.cs
new file mode 100644
index 0000000000..c46bc8046b
--- /dev/null
+++ b/dotnet/src/Microsoft.Agents.AI/Evaluation/AgentEvaluationResults.cs
@@ -0,0 +1,127 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using Microsoft.Extensions.AI.Evaluation;
+
+namespace Microsoft.Agents.AI;
+
+/// <summary>
+/// Aggregate evaluation results across multiple items.
+/// </summary>
+public sealed class AgentEvaluationResults
+{
+ private readonly List _items;
+
+ /// <summary>
+ /// Initializes a new instance of the <see cref="AgentEvaluationResults"/> class.
+ /// </summary>
+ /// <param name="provider">Name of the evaluation provider.</param>
+ /// <param name="items">Per-item MEAI evaluation results.</param>
+ /// <param name="inputItems">The original eval items that were evaluated, for auditing.</param>
+ public AgentEvaluationResults(string provider, IEnumerable items, IReadOnlyList? inputItems = null)
+ {
+ this.Provider = provider;
+ this._items = new List(items); // Copied, so later changes to the caller's sequence are not observed.
+ this.InputItems = inputItems;
+ }
+
+ /// <summary>Gets the evaluation provider name.</summary>
+ public string Provider { get; }
+
+ /// <summary>Gets or sets the portal URL for viewing results (Foundry only).</summary>
+ public Uri? ReportUrl { get; set; }
+
+ /// <summary>Gets the per-item MEAI evaluation results.</summary>
+ public IReadOnlyList Items => this._items;
+
+ /// <summary>
+ /// Gets the original eval items that produced these results, for auditing.
+ /// Each entry corresponds positionally to <see cref="Items"/> — <c>InputItems[i]</c>
+ /// is the query/response that produced <c>Items[i]</c>.
+ /// </summary>
+ public IReadOnlyList? InputItems { get; }
+
+ /// <summary>Gets or sets per-agent results for workflow evaluations.</summary>
+ public IReadOnlyDictionary? SubResults { get; set; }
+
+ /// <summary>Gets the number of items that passed.</summary>
+ public int Passed => this._items.Count(ItemPassed);
+
+ /// <summary>Gets the number of items that failed (every item that did not pass, including items with no metrics).</summary>
+ public int Failed => this._items.Count(i => !ItemPassed(i));
+
+ /// <summary>Gets the total number of items evaluated.</summary>
+ public int Total => this._items.Count;
+
+ /// <summary>Gets whether all items passed, including every entry of <see cref="SubResults"/> when it is set.</summary>
+ public bool AllPassed
+ {
+ get
+ {
+ if (this.SubResults is not null)
+ {
+ return this.SubResults.Values.All(s => s.AllPassed) // Every sub-result must pass...
+ && (this.Total == 0 || this.Failed == 0); // ...and the top level may be empty but must have no failures.
+ }
+
+ return this.Total > 0 && this.Failed == 0; // Without sub-results, an empty result set does not count as passed.
+ }
+ }
+
+ /// <summary>
+ /// Asserts that all items passed. Throws on failure.
+ /// </summary>
+ /// <param name="message">Optional custom failure message.</param>
+ /// <exception cref="InvalidOperationException">Thrown when any items failed.</exception>
+ public void AssertAllPassed(string? message = null)
+ {
+ if (!this.AllPassed)
+ {
+ var detail = message ?? $"{this.Provider}: {this.Passed} passed, {this.Failed} failed out of {this.Total}."; // A custom message replaces only this summary sentence.
+ if (this.ReportUrl is not null)
+ {
+ detail += $" See {this.ReportUrl} for details.";
+ }
+
+ if (this.SubResults is not null)
+ {
+ var failedAgents = this.SubResults
+ .Where(kvp => !kvp.Value.AllPassed)
+ .Select(kvp => kvp.Key);
+ detail += $" Failed agents: {string.Join(", ", failedAgents)}."; // Appended even when the list is empty (only top-level items failed).
+ }
+
+ throw new InvalidOperationException(detail);
+ }
+ }
+
+ private static bool ItemPassed(EvaluationResult result) // An item passes when it has at least one metric and no metric fails.
+ {
+ foreach (var metric in result.Metrics.Values)
+ {
+ if (metric.Interpretation?.Failed == true) // An explicit failed interpretation fails the item.
+ {
+ return false;
+ }
+
+ if (metric is NumericMetric numeric && numeric.Value.HasValue)
+ {
+ if (numeric.Value.Value < 3.0) // Numeric scores below 3.0 fail even when the interpretation did not flag a failure.
+ {
+ return false;
+ }
+ }
+ else if (metric is BooleanMetric boolean && boolean.Value.HasValue)
+ {
+ if (!boolean.Value.Value) // A false boolean metric fails the item.
+ {
+ return false;
+ }
+ }
+ }
+
+ return result.Metrics.Count > 0; // A result with no metrics is never a pass.
+ }
+}
diff --git a/dotnet/src/Microsoft.Agents.AI/Evaluation/CheckResult.cs b/dotnet/src/Microsoft.Agents.AI/Evaluation/CheckResult.cs
new file mode 100644
index 0000000000..46f47bb3c9
--- /dev/null
+++ b/dotnet/src/Microsoft.Agents.AI/Evaluation/CheckResult.cs
@@ -0,0 +1,11 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+namespace Microsoft.Agents.AI;
+
+/// <summary>
+/// Result of a single check on a single evaluation item.
+/// </summary>
+/// <param name="Passed">Whether the check passed.</param>
+/// <param name="Reason">Human-readable explanation.</param>
+/// <param name="CheckName">Name of the check that produced this result.</param>
+public sealed record EvalCheckResult(bool Passed, string Reason, string CheckName);
diff --git a/dotnet/src/Microsoft.Agents.AI/Evaluation/EvalCheck.cs b/dotnet/src/Microsoft.Agents.AI/Evaluation/EvalCheck.cs
new file mode 100644
index 0000000000..eae0750418
--- /dev/null
+++ b/dotnet/src/Microsoft.Agents.AI/Evaluation/EvalCheck.cs
@@ -0,0 +1,10 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+namespace Microsoft.Agents.AI;
+
+/// <summary>
+/// Delegate for a synchronous evaluation check on a single item.
+/// </summary>
+/// <param name="item">The evaluation item.</param>
+/// <returns>The check result.</returns>
+public delegate EvalCheckResult EvalCheck(EvalItem item);
diff --git a/dotnet/src/Microsoft.Agents.AI/Evaluation/EvalChecks.cs b/dotnet/src/Microsoft.Agents.AI/Evaluation/EvalChecks.cs
new file mode 100644
index 0000000000..5dfa2da612
--- /dev/null
+++ b/dotnet/src/Microsoft.Agents.AI/Evaluation/EvalChecks.cs
@@ -0,0 +1,86 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using Microsoft.Extensions.AI;
+
+namespace Microsoft.Agents.AI;
+
+/// <summary>
+/// Built-in check functions for common evaluation patterns.
+/// </summary>
+public static class EvalChecks
+{
+ /// <summary>
+ /// Creates a check that verifies the response contains all specified keywords (case-insensitive).
+ /// </summary>
+ /// <param name="keywords">Keywords that must appear in the response.</param>
+ /// <returns>An <see cref="EvalCheck"/> delegate.</returns>
+ public static EvalCheck KeywordCheck(params string[] keywords)
+ {
+ return KeywordCheck(caseSensitive: false, keywords);
+ }
+
+ /// <summary>
+ /// Creates a check that verifies the response contains all specified keywords.
+ /// </summary>
+ /// <param name="caseSensitive">Whether the comparison is case-sensitive.</param>
+ /// <param name="keywords">Keywords that must appear in the response.</param>
+ /// <returns>An <see cref="EvalCheck"/> delegate.</returns>
+ public static EvalCheck KeywordCheck(bool caseSensitive, params string[] keywords)
+ {
+ return (EvalItem item) =>
+ {
+ var comparison = caseSensitive // Ordinal comparison either way; only case handling differs.
+ ? StringComparison.Ordinal
+ : StringComparison.OrdinalIgnoreCase;
+
+ var missing = keywords
+ .Where(kw => !item.Response.Contains(kw, comparison)) // Substring match against the response text.
+ .ToList();
+
+ var passed = missing.Count == 0; // With no keywords supplied the check passes.
+ var reason = passed
+ ? $"All keywords found: {string.Join(", ", keywords)}"
+ : $"Missing keywords: {string.Join(", ", missing)}";
+
+ return new EvalCheckResult(passed, reason, "keyword_check");
+ };
+ }
+
+ /// <summary>
+ /// Creates a check that verifies specific tools were called in the conversation.
+ /// </summary>
+ /// <param name="toolNames">Tool names that must appear in the conversation.</param>
+ /// <returns>An <see cref="EvalCheck"/> delegate.</returns>
+ public static EvalCheck ToolCalledCheck(params string[] toolNames)
+ {
+ return (EvalItem item) =>
+ {
+ var calledTools = new HashSet(StringComparer.OrdinalIgnoreCase); // Tool names are matched case-insensitively.
+
+ foreach (var message in item.Conversation) // Scans every message of the conversation, whatever its role.
+ {
+ foreach (var content in message.Contents)
+ {
+ if (content is FunctionCallContent functionCall)
+ {
+ calledTools.Add(functionCall.Name);
+ }
+ }
+ }
+
+ var missing = toolNames
+ .Where(t => !calledTools.Contains(t))
+ .ToList();
+
+ var passed = missing.Count == 0; // Extra, unexpected tool calls do not fail the check.
+ var reason = passed
+ ? $"All tools called: {string.Join(", ", toolNames)}"
+ : $"Missing tool calls: {string.Join(", ", missing)}";
+
+ return new EvalCheckResult(passed, reason, "tool_called_check");
+ };
+ }
+}
diff --git a/dotnet/src/Microsoft.Agents.AI/Evaluation/EvalItem.cs b/dotnet/src/Microsoft.Agents.AI/Evaluation/EvalItem.cs
new file mode 100644
index 0000000000..93e860ae65
--- /dev/null
+++ b/dotnet/src/Microsoft.Agents.AI/Evaluation/EvalItem.cs
@@ -0,0 +1,140 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+using System.Collections.Generic;
+using System.Linq;
+using Microsoft.Extensions.AI;
+
+namespace Microsoft.Agents.AI;
+
+/// <summary>
+/// Provider-agnostic data for a single evaluation item.
+/// </summary>
+public sealed class EvalItem
+{
+ /// <summary>
+ /// Initializes a new instance of the <see cref="EvalItem"/> class.
+ /// </summary>
+ /// <param name="query">The user query.</param>
+ /// <param name="response">The agent response text.</param>
+ /// <param name="conversation">The full conversation as <see cref="ChatMessage"/> list.</param>
+ public EvalItem(string query, string response, IReadOnlyList conversation)
+ {
+ this.Query = query;
+ this.Response = response;
+ this.Conversation = conversation; // Stored by reference, not copied.
+ }
+
+ /// <summary>Gets the user query.</summary>
+ public string Query { get; }
+
+ /// <summary>Gets the agent response text.</summary>
+ public string Response { get; }
+
+ /// <summary>Gets the full conversation history.</summary>
+ public IReadOnlyList Conversation { get; }
+
+ /// <summary>Gets or sets the tools available to the agent.</summary>
+ public IReadOnlyList? Tools { get; set; }
+
+ /// <summary>Gets or sets grounding context for evaluation.</summary>
+ public string? Context { get; set; }
+
+ /// <summary>Gets or sets the expected output for ground-truth comparison.</summary>
+ public string? ExpectedOutput { get; set; }
+
+ /// <summary>
+ /// Gets or sets the expected tool calls for tool-correctness evaluation.
+ /// </summary>
+ /// <remarks>
+ /// Each entry describes a tool call the agent should make. The evaluator
+ /// decides matching semantics (ordering, extras, argument checking).
+ /// See <see cref="ExpectedToolCall"/>.
+ /// </remarks>
+ public IReadOnlyList? ExpectedToolCalls { get; set; }
+
+ /// <summary>Gets or sets the raw chat response for MEAI evaluators.</summary>
+ public ChatResponse? RawResponse { get; set; }
+
+ /// <summary>
+ /// Gets or sets the conversation splitter for this item.
+ /// </summary>
+ /// <remarks>
+ /// When set by orchestration functions (e.g. <c>EvaluateAsync(splitter: ...)</c>),
+ /// this is used as the default by <see cref="Split"/>.
+ /// Priority: explicit <c>Split(splitter)</c> argument &gt;
+ /// <see cref="Splitter"/> &gt; <see cref="ConversationSplitters.LastTurn"/>.
+ /// </remarks>
+ public IConversationSplitter? Splitter { get; set; }
+
+ /// <summary>
+ /// Splits the conversation into query messages and response messages.
+ /// </summary>
+ /// <param name="splitter">
+ /// The splitter to use. When <see langword="null"/>, uses <see cref="Splitter"/>
+ /// if set, otherwise <see cref="ConversationSplitters.LastTurn"/>.
+ /// </param>
+ /// <returns>A tuple of (query messages, response messages).</returns>
+ public (IReadOnlyList QueryMessages, IReadOnlyList ResponseMessages) Split(
+ IConversationSplitter? splitter = null)
+ {
+ var effective = splitter ?? this.Splitter ?? ConversationSplitters.LastTurn; // Explicit argument wins, then the item's own splitter, then the default.
+ return effective.Split(this.Conversation);
+ }
+
+ /// <summary>
+ /// Splits a multi-turn conversation into one <see cref="EvalItem"/> per user turn.
+ /// </summary>
+ /// <remarks>
+ /// Each user message starts a new turn. The resulting item has cumulative context:
+ /// query messages contain the full conversation up to and including that user message,
+ /// and the response is everything up to the next user message.
+ /// </remarks>
+ /// <param name="conversation">The full conversation to split.</param>
+ /// <param name="tools">Optional tools available to the agent.</param>
+ /// <param name="context">Optional grounding context.</param>
+ /// <returns>A list of eval items, one per user turn.</returns>
+ public static IReadOnlyList PerTurnItems(
+ IReadOnlyList conversation,
+ IReadOnlyList? tools = null,
+ string? context = null)
+ {
+ var items = new List();
+ var userIndices = new List(); // Positions of every user message; each one opens a turn.
+
+ for (int i = 0; i < conversation.Count; i++)
+ {
+ if (conversation[i].Role == ChatRole.User)
+ {
+ userIndices.Add(i);
+ }
+ }
+
+ for (int t = 0; t < userIndices.Count; t++) // A conversation with no user message yields no items.
+ {
+ int userIdx = userIndices[t];
+ int nextBoundary = t + 1 < userIndices.Count // The turn ends at the next user message, or at the end of the conversation.
+ ? userIndices[t + 1]
+ : conversation.Count;
+
+ var responseMessages = conversation.Skip(userIdx + 1).Take(nextBoundary - userIdx - 1).ToList(); // Messages strictly between this user message and the boundary.
+
+ var query = conversation[userIdx].Text ?? string.Empty;
+ var responseText = string.Join( // Response text joins the non-empty assistant messages of this turn with single spaces.
+ " ",
+ responseMessages
+ .Where(m => m.Role == ChatRole.Assistant && !string.IsNullOrEmpty(m.Text))
+ .Select(m => m.Text));
+
+ var fullSlice = conversation.Take(nextBoundary).ToList(); // Cumulative: everything from the start through this turn's response.
+ var item = new EvalItem(query, responseText, fullSlice)
+ {
+ Tools = tools,
+ Context = context,
+ };
+
+ items.Add(item);
+ }
+
+ return items;
+ }
+}
diff --git a/dotnet/src/Microsoft.Agents.AI/Evaluation/ExpectedToolCall.cs b/dotnet/src/Microsoft.Agents.AI/Evaluation/ExpectedToolCall.cs
new file mode 100644
index 0000000000..9b30899df4
--- /dev/null
+++ b/dotnet/src/Microsoft.Agents.AI/Evaluation/ExpectedToolCall.cs
@@ -0,0 +1,20 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+using System.Collections.Generic;
+
+namespace Microsoft.Agents.AI;
+
+/// <summary>
+/// A tool call that an agent is expected to make.
+/// </summary>
+/// <remarks>
+/// Used with <c>EvaluateAsync</c> to assert that the agent called the correct tools.
+/// The evaluator decides matching semantics (order, extras, argument checking);
+/// this type is pure data.
+/// </remarks>
+/// <param name="Name">The tool/function name (e.g. <c>"get_weather"</c>).</param>
+/// <param name="Arguments">
+/// Expected arguments. <see langword="null"/> means "don't check arguments".
+/// When provided, evaluators typically do subset matching (all expected keys must be present).
+/// </param>
+public record ExpectedToolCall(string Name, IReadOnlyDictionary? Arguments = null);
diff --git a/dotnet/src/Microsoft.Agents.AI/Evaluation/FunctionEvaluator.cs b/dotnet/src/Microsoft.Agents.AI/Evaluation/FunctionEvaluator.cs
new file mode 100644
index 0000000000..a9024c7750
--- /dev/null
+++ b/dotnet/src/Microsoft.Agents.AI/Evaluation/FunctionEvaluator.cs
@@ -0,0 +1,68 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+using System;
+
+namespace Microsoft.Agents.AI;
+
+/// <summary>
+/// Factory for creating <see cref="EvalCheck"/> delegates from typed lambda functions.
+/// </summary>
+public static class FunctionEvaluator
+{
+ /// <summary>
+ /// Creates a check from a function that takes the response text and returns a bool.
+ /// </summary>
+ /// <param name="name">Check name for reporting.</param>
+ /// <param name="check">Function that returns true if the response passes.</param>
+ public static EvalCheck Create(string name, Func check)
+ {
+ return (EvalItem item) =>
+ {
+ var passed = check(item.Response);
+ return new EvalCheckResult(passed, passed ? "Passed" : "Failed", name); // Reason is a fixed "Passed"/"Failed" string.
+ };
+ }
+
+ /// <summary>
+ /// Creates a check from a function that takes response and expected text.
+ /// </summary>
+ /// <param name="name">Check name for reporting.</param>
+ /// <param name="check">Function that returns true if the response passes.</param>
+ public static EvalCheck Create(string name, Func check)
+ {
+ return (EvalItem item) =>
+ {
+ var passed = check(item.Response, item.ExpectedOutput); // ExpectedOutput is passed through as-is and may be null.
+ return new EvalCheckResult(passed, passed ? "Passed" : "Failed", name);
+ };
+ }
+
+ /// <summary>
+ /// Creates a check from a function that takes the full <see cref="EvalItem"/>.
+ /// </summary>
+ /// <param name="name">Check name for reporting.</param>
+ /// <param name="check">Function that returns true if the item passes.</param>
+ public static EvalCheck Create(string name, Func check)
+ {
+ return (EvalItem item) =>
+ {
+ var passed = check(item);
+ return new EvalCheckResult(passed, passed ? "Passed" : "Failed", name);
+ };
+ }
+
+ /// <summary>
+ /// Creates a check from a function that takes the full <see cref="EvalItem"/>
+ /// and returns a <see cref="EvalCheckResult"/>.
+ /// </summary>
+ /// <param name="name">Check name (used as fallback if the result has no name).</param>
+ /// <param name="check">Function that returns a full check result.</param>
+ public static EvalCheck Create(string name, Func check)
+ {
+ return (EvalItem item) =>
+ {
+ var result = check(item);
+ return result with { CheckName = result.CheckName ?? name }; // Keep the result's own name; fall back to the supplied one only when it is null.
+ };
+ }
+}
diff --git a/dotnet/src/Microsoft.Agents.AI/Evaluation/IAgentEvaluator.cs b/dotnet/src/Microsoft.Agents.AI/Evaluation/IAgentEvaluator.cs
new file mode 100644
index 0000000000..2dc84e35eb
--- /dev/null
+++ b/dotnet/src/Microsoft.Agents.AI/Evaluation/IAgentEvaluator.cs
@@ -0,0 +1,33 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+using System.Collections.Generic;
+using System.Threading;
+using System.Threading.Tasks;
+
+namespace Microsoft.Agents.AI;
+
+/// <summary>
+/// Batch-oriented evaluator interface for agent evaluation.
+/// </summary>
+/// <remarks>
+/// Unlike MEAI's <c>IEvaluator</c> which evaluates one item at a time,
+/// <see cref="IAgentEvaluator"/> evaluates a batch of items. This enables
+/// efficient cloud-based evaluation (e.g., Foundry) and aggregate result computation.
+/// </remarks>
+public interface IAgentEvaluator
+{
+ /// <summary>Gets the evaluator name.</summary>
+ string Name { get; }
+
+ /// <summary>
+ /// Evaluates a batch of items and returns aggregate results.
+ /// </summary>
+ /// <param name="items">The items to evaluate.</param>
+ /// <param name="evalName">A display name for this evaluation run.</param>
+ /// <param name="cancellationToken">Cancellation token.</param>
+ /// <returns>Aggregate evaluation results.</returns>
+ Task EvaluateAsync(
+ IReadOnlyList items,
+ string evalName = "Agent Framework Eval",
+ CancellationToken cancellationToken = default);
+}
diff --git a/dotnet/src/Microsoft.Agents.AI/Evaluation/IConversationSplitter.cs b/dotnet/src/Microsoft.Agents.AI/Evaluation/IConversationSplitter.cs
new file mode 100644
index 0000000000..f07282e4de
--- /dev/null
+++ b/dotnet/src/Microsoft.Agents.AI/Evaluation/IConversationSplitter.cs
@@ -0,0 +1,103 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+using System.Collections.Generic;
+using System.Linq;
+using Microsoft.Extensions.AI;
+
+namespace Microsoft.Agents.AI;
+
+/// <summary>
+/// Strategy for splitting a conversation into query and response halves for evaluation.
+/// </summary>
+/// <remarks>
+/// Use one of the built-in splitters from <see cref="ConversationSplitters"/> or implement
+/// your own for domain-specific splitting logic (e.g., splitting before a memory-retrieval
+/// tool call to evaluate recall quality).
+/// </remarks>
+public interface IConversationSplitter
+{
+ /// <summary>
+ /// Splits a conversation into query messages and response messages.
+ /// </summary>
+ /// <param name="conversation">The full conversation to split.</param>
+ /// <returns>A tuple of (query messages, response messages).</returns>
+ (IReadOnlyList QueryMessages, IReadOnlyList ResponseMessages) Split(
+ IReadOnlyList conversation);
+}
+
+/// <summary>
+/// Built-in conversation splitters for common evaluation patterns.
+/// </summary>
+/// <remarks>
+/// <list type="bullet">
+/// <item><description><see cref="LastTurn"/>: Evaluates whether the agent answered the latest question well.</description></item>
+/// <item><description><see cref="Full"/>: Evaluates whether the whole conversation trajectory served the original request.</description></item>
+/// </list>
+/// For custom splits, implement <see cref="IConversationSplitter"/> directly.
+/// </remarks>
+public static class ConversationSplitters
+{
+ /// <summary>
+ /// Split at the last user message. Everything up to and including that message
+ /// is the query; everything after is the response. This is the default strategy.
+ /// </summary>
+ public static IConversationSplitter LastTurn { get; } = new LastTurnSplitter();
+
+ /// <summary>
+ /// The first user message (and any messages preceding it) is the query;
+ /// the entire remainder of the conversation is the response.
+ /// Evaluates overall conversation trajectory.
+ /// </summary>
+ public static IConversationSplitter Full { get; } = new FullSplitter();
+
+ private sealed class LastTurnSplitter : IConversationSplitter
+ {
+ public (IReadOnlyList, IReadOnlyList) Split(
+ IReadOnlyList conversation)
+ {
+ int lastUserIdx = -1; // -1 means no user message was found.
+ for (int i = 0; i < conversation.Count; i++) // Scans the whole list so the final user message wins.
+ {
+ if (conversation[i].Role == ChatRole.User)
+ {
+ lastUserIdx = i;
+ }
+ }
+
+ if (lastUserIdx >= 0)
+ {
+ return (
+ conversation.Take(lastUserIdx + 1).ToList(), // Query: start through the last user message.
+ conversation.Skip(lastUserIdx + 1).ToList()); // Response: everything after it.
+ }
+
+ return (new List(), conversation.ToList()); // No user message: empty query, whole conversation as response.
+ }
+ }
+
+ private sealed class FullSplitter : IConversationSplitter
+ {
+ public (IReadOnlyList, IReadOnlyList) Split(
+ IReadOnlyList conversation)
+ {
+ int firstUserIdx = -1; // -1 means no user message was found.
+ for (int i = 0; i < conversation.Count; i++)
+ {
+ if (conversation[i].Role == ChatRole.User)
+ {
+ firstUserIdx = i;
+ break; // Stop at the first user message.
+ }
+ }
+
+ if (firstUserIdx >= 0)
+ {
+ return (
+ conversation.Take(firstUserIdx + 1).ToList(), // Query: start through the first user message.
+ conversation.Skip(firstUserIdx + 1).ToList()); // Response: the rest of the conversation.
+ }
+
+ return (new List(), conversation.ToList()); // No user message: empty query, whole conversation as response.
+ }
+ }
+}
diff --git a/dotnet/src/Microsoft.Agents.AI/Evaluation/LocalEvaluator.cs b/dotnet/src/Microsoft.Agents.AI/Evaluation/LocalEvaluator.cs
new file mode 100644
index 0000000000..2b664b0e3b
--- /dev/null
+++ b/dotnet/src/Microsoft.Agents.AI/Evaluation/LocalEvaluator.cs
@@ -0,0 +1,66 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+using System.Collections.Generic;
+using System.Threading;
+using System.Threading.Tasks;
+using Microsoft.Extensions.AI.Evaluation;
+
+namespace Microsoft.Agents.AI;
+
+/// <summary>
+/// Evaluator that runs check functions locally without API calls.
+/// </summary>
+public sealed class LocalEvaluator : IAgentEvaluator
+{
+ private readonly EvalCheck[] _checks;
+
+ /// <summary>
+ /// Initializes a new instance of the <see cref="LocalEvaluator"/> class.
+ /// </summary>
+ /// <param name="checks">The check functions to run on each item.</param>
+ public LocalEvaluator(params EvalCheck[] checks)
+ {
+ this._checks = checks;
+ }
+
+ /// <inheritdoc />
+ public string Name => "LocalEvaluator";
+
+ /// <inheritdoc />
+ public Task EvaluateAsync(
+ IReadOnlyList items,
+ string evalName = "Local Eval",
+ CancellationToken cancellationToken = default)
+ {
+ var results = new List(items.Count); // One result per input item, in the same order.
+
+ foreach (var item in items)
+ {
+ cancellationToken.ThrowIfCancellationRequested(); // Checked once per item; the checks themselves are synchronous.
+
+ var evalResult = new EvaluationResult();
+
+ foreach (var check in this._checks)
+ {
+ var checkResult = check(item);
+ evalResult.Metrics[checkResult.CheckName] = new BooleanMetric( // Keyed by check name: checks sharing a name overwrite each other, last one wins.
+ checkResult.CheckName,
+ checkResult.Passed,
+ reason: checkResult.Reason)
+ {
+ Interpretation = new EvaluationMetricInterpretation
+ {
+ Rating = checkResult.Passed
+ ? EvaluationRating.Good
+ : EvaluationRating.Unacceptable,
+ Failed = !checkResult.Passed,
+ },
+ };
+ }
+
+ results.Add(evalResult);
+ }
+
+ return Task.FromResult(new AgentEvaluationResults(this.Name, results, inputItems: items)); // Completes synchronously; input items are attached for auditing.
+ }
+}
diff --git a/dotnet/src/Microsoft.Agents.AI/Evaluation/MeaiEvaluatorAdapter.cs b/dotnet/src/Microsoft.Agents.AI/Evaluation/MeaiEvaluatorAdapter.cs
new file mode 100644
index 0000000000..e2a6ea67e4
--- /dev/null
+++ b/dotnet/src/Microsoft.Agents.AI/Evaluation/MeaiEvaluatorAdapter.cs
@@ -0,0 +1,63 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+using System.Collections.Generic;
+using System.Linq;
+using System.Threading;
+using System.Threading.Tasks;
+using Microsoft.Extensions.AI;
+using Microsoft.Extensions.AI.Evaluation;
+
+namespace Microsoft.Agents.AI;
+
+/// <summary>
+/// Adapter that wraps an MEAI <see cref="IEvaluator"/> into an <see cref="IAgentEvaluator"/>.
+/// Runs the MEAI evaluator per-item and aggregates results.
+/// </summary>
+internal sealed class MeaiEvaluatorAdapter : IAgentEvaluator
+{
+    private readonly IEvaluator _evaluator;
+    private readonly ChatConfiguration _chatConfiguration;
+
+    /// <summary>
+    /// Initializes a new instance of the <see cref="MeaiEvaluatorAdapter"/> class.
+    /// </summary>
+    /// <param name="evaluator">The MEAI evaluator to wrap.</param>
+    /// <param name="chatConfiguration">Chat configuration for the evaluator (includes the judge model).</param>
+    public MeaiEvaluatorAdapter(IEvaluator evaluator, ChatConfiguration chatConfiguration)
+    {
+        this._evaluator = evaluator ?? throw new System.ArgumentNullException(nameof(evaluator));
+        this._chatConfiguration = chatConfiguration;
+    }
+
+    /// <inheritdoc />
+    public string Name => this._evaluator.GetType().Name;
+
+    /// <inheritdoc />
+    public async Task<AgentEvaluationResults> EvaluateAsync(
+        IReadOnlyList<EvalItem> items,
+        string evalName = "MEAI Eval",
+        CancellationToken cancellationToken = default)
+    {
+        _ = items ?? throw new System.ArgumentNullException(nameof(items));
+
+        var results = new List<EvaluationResult>(items.Count);
+
+        foreach (var item in items)
+        {
+            cancellationToken.ThrowIfCancellationRequested();
+
+            var (queryMessages, _) = item.Split();
+            var messages = queryMessages.ToList();
+
+            // Fall back to a response synthesized from the item's text when no raw response is attached.
+            var chatResponse = item.RawResponse
+                ?? new ChatResponse(new ChatMessage(ChatRole.Assistant, item.Response));
+            var result = await this._evaluator
+                .EvaluateAsync(messages, chatResponse, this._chatConfiguration, cancellationToken: cancellationToken)
+                .ConfigureAwait(false);
+            results.Add(result);
+        }
+
+        return new AgentEvaluationResults(this.Name, results, inputItems: items);
+    }
+}
diff --git a/dotnet/src/Microsoft.Agents.AI/Microsoft.Agents.AI.csproj b/dotnet/src/Microsoft.Agents.AI/Microsoft.Agents.AI.csproj
index 70da404a61..a111ce8c2d 100644
--- a/dotnet/src/Microsoft.Agents.AI/Microsoft.Agents.AI.csproj
+++ b/dotnet/src/Microsoft.Agents.AI/Microsoft.Agents.AI.csproj
@@ -31,6 +31,14 @@
+
+
+
+
+
+
+
+
Microsoft Agent Framework
diff --git a/dotnet/tests/Microsoft.Agents.AI.UnitTests/EvaluationTests.cs b/dotnet/tests/Microsoft.Agents.AI.UnitTests/EvaluationTests.cs
new file mode 100644
index 0000000000..00c3519f3f
--- /dev/null
+++ b/dotnet/tests/Microsoft.Agents.AI.UnitTests/EvaluationTests.cs
@@ -0,0 +1,1112 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Threading.Tasks;
+using Microsoft.Extensions.AI;
+using Microsoft.Extensions.AI.Evaluation;
+
+namespace Microsoft.Agents.AI.UnitTests;
+
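+/// <summary>
+/// Tests for the evaluation types: <see cref="EvalItem"/>, <see cref="LocalEvaluator"/>,
+/// <see cref="FunctionEvaluator"/>, <see cref="EvalChecks"/>, and <see cref="AgentEvaluationResults"/>.
+/// </summary>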
+public sealed class EvaluationTests
+{
+ private static EvalItem CreateItem(
+ string query = "What is the weather?",
+ string response = "The weather in Seattle is sunny and 72°F.",
+ IReadOnlyList? conversation = null)
+ {
+ conversation ??= new List
+ {
+ new(ChatRole.User, query),
+ new(ChatRole.Assistant, response),
+ };
+
+ return new EvalItem(query, response, conversation);
+ }
+
+ // ---------------------------------------------------------------
+ // EvalItem tests
+ // ---------------------------------------------------------------
+
+ [Fact]
+ public void EvalItem_Constructor_SetsProperties()
+ {
+ // Arrange & Act
+ var item = CreateItem();
+
+ // Assert
+ Assert.Equal("What is the weather?", item.Query);
+ Assert.Equal("The weather in Seattle is sunny and 72°F.", item.Response);
+ Assert.Equal(2, item.Conversation.Count);
+ Assert.Null(item.ExpectedOutput);
+ Assert.Null(item.Context);
+ Assert.Null(item.Tools);
+ }
+
+ [Fact]
+ public void EvalItem_OptionalProperties_CanBeSet()
+ {
+ // Arrange & Act
+ var item = CreateItem();
+ item.ExpectedOutput = "sunny";
+ item.Context = "Weather data for Seattle";
+
+ // Assert
+ Assert.Equal("sunny", item.ExpectedOutput);
+ Assert.Equal("Weather data for Seattle", item.Context);
+ }
+
+ // ---------------------------------------------------------------
+ // LocalEvaluator tests
+ // ---------------------------------------------------------------
+
+ [Fact]
+ public async Task LocalEvaluator_WithPassingCheck_ReturnsPassedResultAsync()
+ {
+ // Arrange
+ var evaluator = new LocalEvaluator(
+ FunctionEvaluator.Create("always_pass", (string _) => true));
+
+ var items = new List { CreateItem() };
+
+ // Act
+ var results = await evaluator.EvaluateAsync(items);
+
+ // Assert
+ Assert.Equal("LocalEvaluator", results.Provider);
+ Assert.Equal(1, results.Total);
+ Assert.Equal(1, results.Passed);
+ Assert.Equal(0, results.Failed);
+ Assert.True(results.AllPassed);
+ }
+
+ [Fact]
+ public async Task LocalEvaluator_WithFailingCheck_ReturnsFailedResultAsync()
+ {
+ // Arrange
+ var evaluator = new LocalEvaluator(
+ FunctionEvaluator.Create("always_fail", (string _) => false));
+
+ var items = new List { CreateItem() };
+
+ // Act
+ var results = await evaluator.EvaluateAsync(items);
+
+ // Assert
+ Assert.Equal(1, results.Total);
+ Assert.Equal(0, results.Passed);
+ Assert.Equal(1, results.Failed);
+ Assert.False(results.AllPassed);
+ }
+
+ [Fact]
+ public async Task LocalEvaluator_WithMultipleChecks_AllChecksRunAsync()
+ {
+ // Arrange
+ var evaluator = new LocalEvaluator(
+ FunctionEvaluator.Create("check1", (string _) => true),
+ FunctionEvaluator.Create("check2", (string _) => true));
+
+ var items = new List { CreateItem() };
+
+ // Act
+ var results = await evaluator.EvaluateAsync(items);
+
+ // Assert
+ Assert.Equal(1, results.Total);
+ Assert.True(results.AllPassed);
+ var itemResult = results.Items[0];
+ Assert.Equal(2, itemResult.Metrics.Count);
+ Assert.True(itemResult.Metrics.ContainsKey("check1"));
+ Assert.True(itemResult.Metrics.ContainsKey("check2"));
+ }
+
+ [Fact]
+ public async Task LocalEvaluator_WithMultipleItems_EvaluatesAllAsync()
+ {
+ // Arrange
+ var evaluator = new LocalEvaluator(
+ EvalChecks.KeywordCheck("weather"));
+
+ var items = new List
+ {
+ CreateItem(response: "The weather is sunny."),
+ CreateItem(response: "I don't know about that topic."),
+ };
+
+ // Act
+ var results = await evaluator.EvaluateAsync(items);
+
+ // Assert
+ Assert.Equal(2, results.Total);
+ Assert.Equal(1, results.Passed);
+ Assert.Equal(1, results.Failed);
+ }
+
+ // ---------------------------------------------------------------
+ // FunctionEvaluator tests
+ // ---------------------------------------------------------------
+
+ [Fact]
+ public async Task FunctionEvaluator_ResponseOnly_PassesResponseAsync()
+ {
+ // Arrange
+ var check = FunctionEvaluator.Create("length_check",
+ (string response) => response.Length > 10);
+
+ var evaluator = new LocalEvaluator(check);
+ var items = new List { CreateItem() };
+
+ // Act
+ var results = await evaluator.EvaluateAsync(items);
+
+ // Assert
+ Assert.True(results.AllPassed);
+ }
+
+ [Fact]
+ public async Task FunctionEvaluator_WithExpected_PassesExpectedAsync()
+ {
+ // Arrange
+ var check = FunctionEvaluator.Create("contains_expected",
+ (string response, string? expectedOutput) =>
+ expectedOutput != null && response.Contains(expectedOutput, StringComparison.OrdinalIgnoreCase));
+
+ var evaluator = new LocalEvaluator(check);
+ var item = CreateItem();
+ item.ExpectedOutput = "sunny";
+ var items = new List { item };
+
+ // Act
+ var results = await evaluator.EvaluateAsync(items);
+
+ // Assert
+ Assert.True(results.AllPassed);
+ }
+
+ [Fact]
+ public async Task FunctionEvaluator_FullItem_AccessesAllFieldsAsync()
+ {
+ // Arrange
+ var check = FunctionEvaluator.Create("full_check",
+ (EvalItem item) => item.Query.Contains("weather", StringComparison.OrdinalIgnoreCase)
+ && item.Response.Length > 0);
+
+ var evaluator = new LocalEvaluator(check);
+ var items = new List { CreateItem() };
+
+ // Act
+ var results = await evaluator.EvaluateAsync(items);
+
+ // Assert
+ Assert.True(results.AllPassed);
+ }
+
+ [Fact]
+ public async Task FunctionEvaluator_WithCheckResult_ReturnsCustomReasonAsync()
+ {
+ // Arrange
+ var check = FunctionEvaluator.Create("custom_check",
+ (EvalItem item) => new EvalCheckResult(true, "Custom reason", "custom_check"));
+
+ var evaluator = new LocalEvaluator(check);
+ var items = new List { CreateItem() };
+
+ // Act
+ var results = await evaluator.EvaluateAsync(items);
+
+ // Assert
+ Assert.True(results.AllPassed);
+ var metric = results.Items[0].Get("custom_check");
+ Assert.Equal("Custom reason", metric.Reason);
+ }
+
+ // ---------------------------------------------------------------
+ // EvalChecks tests
+ // ---------------------------------------------------------------
+
+ [Fact]
+ public async Task KeywordCheck_AllKeywordsPresent_PassesAsync()
+ {
+ // Arrange
+ var evaluator = new LocalEvaluator(
+ EvalChecks.KeywordCheck("weather", "sunny"));
+
+ var items = new List { CreateItem() };
+
+ // Act
+ var results = await evaluator.EvaluateAsync(items);
+
+ // Assert
+ Assert.True(results.AllPassed);
+ }
+
+ [Fact]
+ public async Task KeywordCheck_MissingKeyword_FailsAsync()
+ {
+ // Arrange
+ var evaluator = new LocalEvaluator(
+ EvalChecks.KeywordCheck("snow"));
+
+ var items = new List { CreateItem() };
+
+ // Act
+ var results = await evaluator.EvaluateAsync(items);
+
+ // Assert
+ Assert.False(results.AllPassed);
+ }
+
+ [Fact]
+ public async Task KeywordCheck_CaseInsensitiveByDefault_PassesAsync()
+ {
+ // Arrange
+ var evaluator = new LocalEvaluator(
+ EvalChecks.KeywordCheck("WEATHER", "SUNNY"));
+
+ var items = new List { CreateItem() };
+
+ // Act
+ var results = await evaluator.EvaluateAsync(items);
+
+ // Assert
+ Assert.True(results.AllPassed);
+ }
+
+ [Fact]
+ public async Task KeywordCheck_CaseSensitive_FailsOnWrongCaseAsync()
+ {
+ // Arrange
+ var evaluator = new LocalEvaluator(
+ EvalChecks.KeywordCheck(caseSensitive: true, "WEATHER"));
+
+ var items = new List { CreateItem() };
+
+ // Act
+ var results = await evaluator.EvaluateAsync(items);
+
+ // Assert
+ Assert.False(results.AllPassed);
+ }
+
+ [Fact]
+ public async Task ToolCalledCheck_ToolPresent_PassesAsync()
+ {
+ // Arrange
+ var conversation = new List
+ {
+ new(ChatRole.User, "What is the weather?"),
+ new(ChatRole.Assistant, new List
+ {
+ new FunctionCallContent("call1", "get_weather", new Dictionary { ["city"] = "Seattle" }),
+ }),
+ new(ChatRole.Tool, new List
+ {
+ new FunctionResultContent("call1", "72°F and sunny"),
+ }),
+ new(ChatRole.Assistant, "The weather is sunny and 72°F."),
+ };
+
+ var item = CreateItem(conversation: conversation);
+ var evaluator = new LocalEvaluator(
+ EvalChecks.ToolCalledCheck("get_weather"));
+
+ // Act
+ var results = await evaluator.EvaluateAsync(new List { item });
+
+ // Assert
+ Assert.True(results.AllPassed);
+ }
+
+ [Fact]
+ public async Task ToolCalledCheck_ToolMissing_FailsAsync()
+ {
+ // Arrange
+ var evaluator = new LocalEvaluator(
+ EvalChecks.ToolCalledCheck("get_weather"));
+
+ var items = new List { CreateItem() };
+
+ // Act
+ var results = await evaluator.EvaluateAsync(items);
+
+ // Assert
+ Assert.False(results.AllPassed);
+ }
+
+ // ---------------------------------------------------------------
+ // AgentEvaluationResults tests
+ // ---------------------------------------------------------------
+
+ [Fact]
+ public void AgentEvaluationResults_AllPassed_WhenAllMetricsGood()
+ {
+ // Arrange
+ var evalResult = new EvaluationResult();
+ evalResult.Metrics["check"] = new BooleanMetric("check", true)
+ {
+ Interpretation = new EvaluationMetricInterpretation
+ {
+ Rating = EvaluationRating.Good,
+ Failed = false,
+ },
+ };
+
+ // Act
+ var results = new AgentEvaluationResults("test", new[] { evalResult });
+
+ // Assert
+ Assert.True(results.AllPassed);
+ Assert.Equal(1, results.Passed);
+ Assert.Equal(0, results.Failed);
+ }
+
+ [Fact]
+ public void AgentEvaluationResults_NotAllPassed_WhenMetricFailed()
+ {
+ // Arrange
+ var evalResult = new EvaluationResult();
+ evalResult.Metrics["check"] = new BooleanMetric("check", false)
+ {
+ Interpretation = new EvaluationMetricInterpretation
+ {
+ Rating = EvaluationRating.Unacceptable,
+ Failed = true,
+ },
+ };
+
+ // Act
+ var results = new AgentEvaluationResults("test", new[] { evalResult });
+
+ // Assert
+ Assert.False(results.AllPassed);
+ Assert.Equal(0, results.Passed);
+ Assert.Equal(1, results.Failed);
+ }
+
+ [Fact]
+ public void AssertAllPassed_ThrowsOnFailure()
+ {
+ // Arrange
+ var evalResult = new EvaluationResult();
+ evalResult.Metrics["check"] = new BooleanMetric("check", false)
+ {
+ Interpretation = new EvaluationMetricInterpretation
+ {
+ Rating = EvaluationRating.Unacceptable,
+ Failed = true,
+ },
+ };
+
+ var results = new AgentEvaluationResults("test", new[] { evalResult });
+
+ // Act & Assert
+ var ex = Assert.Throws(() => results.AssertAllPassed());
+ Assert.Contains("0 passed", ex.Message);
+ Assert.Contains("1 failed", ex.Message);
+ }
+
+ [Fact]
+ public void AssertAllPassed_DoesNotThrowOnSuccess()
+ {
+ // Arrange
+ var evalResult = new EvaluationResult();
+ evalResult.Metrics["check"] = new BooleanMetric("check", true)
+ {
+ Interpretation = new EvaluationMetricInterpretation
+ {
+ Rating = EvaluationRating.Good,
+ Failed = false,
+ },
+ };
+
+ var results = new AgentEvaluationResults("test", new[] { evalResult });
+
+ // Act & Assert (no exception)
+ results.AssertAllPassed();
+ }
+
+ [Fact]
+ public void AgentEvaluationResults_NumericMetric_HighScorePasses()
+ {
+ // Arrange
+ var evalResult = new EvaluationResult();
+ evalResult.Metrics["relevance"] = new NumericMetric("relevance", 4.5);
+
+ // Act
+ var results = new AgentEvaluationResults("test", new[] { evalResult });
+
+ // Assert
+ Assert.True(results.AllPassed);
+ }
+
+ [Fact]
+ public void AgentEvaluationResults_NumericMetric_LowScoreFails()
+ {
+ // Arrange
+ var evalResult = new EvaluationResult();
+ evalResult.Metrics["relevance"] = new NumericMetric("relevance", 2.0);
+
+ // Act
+ var results = new AgentEvaluationResults("test", new[] { evalResult });
+
+ // Assert
+ Assert.False(results.AllPassed);
+ }
+
+ [Fact]
+ public void AgentEvaluationResults_SubResults_AllPassedChecksChildren()
+ {
+ // Arrange
+ var passResult = new EvaluationResult();
+ passResult.Metrics["check"] = new BooleanMetric("check", true)
+ {
+ Interpretation = new EvaluationMetricInterpretation
+ {
+ Rating = EvaluationRating.Good,
+ Failed = false,
+ },
+ };
+
+ var failResult = new EvaluationResult();
+ failResult.Metrics["check"] = new BooleanMetric("check", false)
+ {
+ Interpretation = new EvaluationMetricInterpretation
+ {
+ Rating = EvaluationRating.Unacceptable,
+ Failed = true,
+ },
+ };
+
+ var results = new AgentEvaluationResults("test", Array.Empty())
+ {
+ SubResults = new Dictionary
+ {
+ ["agent1"] = new("test", new[] { passResult }),
+ ["agent2"] = new("test", new[] { failResult }),
+ },
+ };
+
+ // Assert
+ Assert.False(results.AllPassed);
+ }
+
+ // ---------------------------------------------------------------
+ // Mixed evaluator tests
+ // ---------------------------------------------------------------
+
+ [Fact]
+ public async Task LocalEvaluator_MixedChecks_ReportsCorrectCountsAsync()
+ {
+ // Arrange
+ var evaluator = new LocalEvaluator(
+ EvalChecks.KeywordCheck("weather"),
+ EvalChecks.KeywordCheck("snow"),
+ FunctionEvaluator.Create("is_long", (string r) => r.Length > 5));
+
+ var items = new List { CreateItem() };
+
+ // Act
+ var results = await evaluator.EvaluateAsync(items);
+
+ // Assert
+ Assert.Equal(1, results.Total);
+
+ // One item with 3 checks: "weather" passes, "snow" fails, "is_long" passes
+ // The item has one failed metric so it should count as failed
+ Assert.Equal(0, results.Passed);
+ Assert.Equal(1, results.Failed);
+ }
+
+ // ---------------------------------------------------------------
+ // Conversation Split tests
+ // ---------------------------------------------------------------
+
+ private static List CreateMultiTurnConversation()
+ {
+ return new List
+ {
+ new(ChatRole.User, "What's the weather in Seattle?"),
+ new(ChatRole.Assistant, "Seattle is 62°F and cloudy."),
+ new(ChatRole.User, "And Paris?"),
+ new(ChatRole.Assistant, "Paris is 68°F and partly sunny."),
+ new(ChatRole.User, "Compare them."),
+ new(ChatRole.Assistant, "Seattle is cooler; Paris is warmer and sunnier."),
+ };
+ }
+
+ [Fact]
+ public void Split_LastTurn_SplitsAtLastUserMessage()
+ {
+ // Arrange
+ var conversation = CreateMultiTurnConversation();
+ var item = new EvalItem("Compare them.", "Seattle is cooler; Paris is warmer and sunnier.", conversation);
+
+ // Act
+ var (query, response) = item.Split(ConversationSplitters.LastTurn);
+
+ // Assert — query includes everything up to and including "Compare them."
+ Assert.Equal(5, query.Count);
+ Assert.Equal(ChatRole.User, query[query.Count - 1].Role);
+ Assert.Contains("Compare", query[query.Count - 1].Text);
+
+ // Response is the final assistant message
+ Assert.Single(response);
+ Assert.Equal(ChatRole.Assistant, response[0].Role);
+ }
+
+ [Fact]
+ public void Split_Full_SplitsAtFirstUserMessage()
+ {
+ // Arrange
+ var conversation = CreateMultiTurnConversation();
+ var item = new EvalItem("What's the weather in Seattle?", "Full trajectory", conversation);
+
+ // Act
+ var (query, response) = item.Split(ConversationSplitters.Full);
+
+ // Assert — query is just the first user message
+ Assert.Single(query);
+ Assert.Contains("Seattle", query[0].Text);
+
+ // Response is everything after
+ Assert.Equal(5, response.Count);
+ }
+
+ [Fact]
+ public void Split_Full_IncludesSystemMessagesInQuery()
+ {
+ // Arrange
+ var conversation = new List
+ {
+ new(ChatRole.System, "You are a weather assistant."),
+ new(ChatRole.User, "What's the weather?"),
+ new(ChatRole.Assistant, "It's sunny."),
+ };
+
+ var item = new EvalItem("What's the weather?", "It's sunny.", conversation);
+
+ // Act
+ var (query, response) = item.Split(ConversationSplitters.Full);
+
+ // Assert — system message + first user message
+ Assert.Equal(2, query.Count);
+ Assert.Equal(ChatRole.System, query[0].Role);
+ Assert.Equal(ChatRole.User, query[1].Role);
+ Assert.Single(response);
+ }
+
+ [Fact]
+ public void Split_DefaultIsLastTurn()
+ {
+ // Arrange
+ var conversation = CreateMultiTurnConversation();
+ var item = new EvalItem("Compare them.", "response", conversation);
+
+ // Act — no split specified
+ var (query, response) = item.Split();
+
+ // Assert — same as LastTurn
+ Assert.Equal(5, query.Count);
+ Assert.Single(response);
+ }
+
+ [Fact]
+ public void Split_SplitterProperty_UsedWhenNoExplicitSplit()
+ {
+ // Arrange
+ var conversation = CreateMultiTurnConversation();
+ var item = new EvalItem("query", "response", conversation)
+ {
+ Splitter = ConversationSplitters.Full,
+ };
+
+ // Act — no explicit split, should use Splitter
+ var (query, response) = item.Split();
+
+ // Assert — Full split
+ Assert.Single(query);
+ Assert.Equal(5, response.Count);
+ }
+
+ [Fact]
+ public void Split_ExplicitSplitter_OverridesSplitterProperty()
+ {
+ // Arrange
+ var conversation = CreateMultiTurnConversation();
+ var item = new EvalItem("query", "response", conversation)
+ {
+ Splitter = ConversationSplitters.Full,
+ };
+
+ // Act — explicit LastTurn overrides Full
+ var (query, response) = item.Split(ConversationSplitters.LastTurn);
+
+ // Assert — LastTurn behavior
+ Assert.Equal(5, query.Count);
+ Assert.Single(response);
+ }
+
+ [Fact]
+ public void Split_WithToolMessages_PreservesToolPairs()
+ {
+ // Arrange
+ var conversation = new List
+ {
+ new(ChatRole.User, "What's the weather?"),
+ new(ChatRole.Assistant, new List
+ {
+ new FunctionCallContent("c1", "get_weather", new Dictionary { ["city"] = "Seattle" }),
+ }),
+ new(ChatRole.Tool, new List
+ {
+ new FunctionResultContent("c1", "62°F, cloudy"),
+ }),
+ new(ChatRole.Assistant, "Seattle is 62°F and cloudy."),
+ new(ChatRole.User, "Thanks!"),
+ new(ChatRole.Assistant, "You're welcome!"),
+ };
+
+ var item = new EvalItem("Thanks!", "You're welcome!", conversation);
+
+ // Act
+ var (query, response) = item.Split(ConversationSplitters.LastTurn);
+
+ // Assert — tool messages stay in query context
+ Assert.Equal(5, query.Count);
+ Assert.Equal(ChatRole.Tool, query[2].Role);
+ Assert.Single(response);
+ }
+
+ [Fact]
+ public void ConversationSplitters_LastTurn_CanBeUsedAsCustomFallback()
+ {
+ // Arrange
+ var conversation = CreateMultiTurnConversation();
+
+ // Act — use ConversationSplitters.LastTurn directly
+ var (query, response) = ConversationSplitters.LastTurn.Split(conversation);
+
+ // Assert
+ Assert.Equal(5, query.Count);
+ Assert.Single(response);
+ }
+
+ // ---------------------------------------------------------------
+ // PerTurnItems tests
+ // ---------------------------------------------------------------
+
+ [Fact]
+ public void PerTurnItems_SplitsMultiTurnConversation()
+ {
+ // Arrange
+ var conversation = CreateMultiTurnConversation();
+
+ // Act
+ var items = EvalItem.PerTurnItems(conversation);
+
+ // Assert — 3 user messages = 3 items
+ Assert.Equal(3, items.Count);
+
+ // First turn: "What's the weather in Seattle?"
+ Assert.Contains("Seattle", items[0].Query);
+ Assert.Contains("62°F", items[0].Response);
+ Assert.Equal(2, items[0].Conversation.Count);
+
+ // Second turn: "And Paris?"
+ Assert.Contains("Paris", items[1].Query);
+ Assert.Contains("68°F", items[1].Response);
+ Assert.Equal(4, items[1].Conversation.Count);
+
+ // Third turn: "Compare them."
+ Assert.Contains("Compare", items[2].Query);
+ Assert.Contains("cooler", items[2].Response);
+ Assert.Equal(6, items[2].Conversation.Count);
+ }
+
+ [Fact]
+ public void PerTurnItems_PropagatesToolsAndContext()
+ {
+ // Arrange
+ var conversation = CreateMultiTurnConversation();
+
+ // Act
+ var items = EvalItem.PerTurnItems(
+ conversation,
+ context: "Weather database");
+
+ // Assert — every per-turn item carries the context; no tools are passed, so tool propagation is not exercised here
+ Assert.All(items, item => Assert.Equal("Weather database", item.Context));
+ }
+
+ [Fact]
+ public void PerTurnItems_SingleTurn_ReturnsOneItem()
+ {
+ // Arrange
+ var conversation = new List
+ {
+ new(ChatRole.User, "Hello"),
+ new(ChatRole.Assistant, "Hi there!"),
+ };
+
+ // Act
+ var items = EvalItem.PerTurnItems(conversation);
+
+ // Assert
+ Assert.Single(items);
+ Assert.Equal("Hello", items[0].Query);
+ Assert.Equal("Hi there!", items[0].Response);
+ }
+
+ // ---------------------------------------------------------------
+ // Custom IConversationSplitter tests
+ // ---------------------------------------------------------------
+
+ [Fact]
+ public void Split_CustomSplitter_IsUsed()
+ {
+ // Arrange — splitter that splits before a tool call message
+ var conversation = new List
+ {
+ new(ChatRole.User, "Remember this"),
+ new(ChatRole.Assistant, "Storing..."),
+ new(ChatRole.User, "What did I say?"),
+ new(ChatRole.Assistant, new List
+ {
+ new FunctionCallContent("c1", "retrieve_memory"),
+ }),
+ new(ChatRole.Tool, new List
+ {
+ new FunctionResultContent("c1", "You said: Remember this"),
+ }),
+ new(ChatRole.Assistant, "You said 'Remember this'."),
+ };
+
+ var splitter = new MemorySplitter();
+ var item = new EvalItem("What did I say?", "You said 'Remember this'.", conversation);
+
+ // Act
+ var (query, response) = item.Split(splitter);
+
+ // Assert — split before the tool call
+ Assert.Equal(3, query.Count);
+ Assert.Equal(3, response.Count);
+ }
+
+ [Fact]
+ public void Split_CustomSplitter_WorksAsItemProperty()
+ {
+ // Arrange — custom splitter set on the item (simulating call-site override)
+ var conversation = new List
+ {
+ new(ChatRole.User, "Remember this"),
+ new(ChatRole.Assistant, "Storing..."),
+ new(ChatRole.User, "What did I say?"),
+ new(ChatRole.Assistant, new List
+ {
+ new FunctionCallContent("c1", "retrieve_memory"),
+ }),
+ new(ChatRole.Tool, new List
+ {
+ new FunctionResultContent("c1", "You said: Remember this"),
+ }),
+ new(ChatRole.Assistant, "You said 'Remember this'."),
+ };
+
+ var item = new EvalItem("What did I say?", "You said 'Remember this'.", conversation)
+ {
+ Splitter = new MemorySplitter(),
+ };
+
+ // Act — no explicit splitter, uses item.Splitter
+ var (query, response) = item.Split();
+
+ // Assert — custom splitter was used
+ Assert.Equal(3, query.Count);
+ Assert.Equal(3, response.Count);
+ }
+
+    private sealed class MemorySplitter : IConversationSplitter
+    {
+        /// <summary>
+        /// Splits immediately before the first assistant message that calls <c>retrieve_memory</c>;
+        /// when no such message exists, defers to <see cref="ConversationSplitters.LastTurn"/>.
+        /// </summary>
+        public (IReadOnlyList<ChatMessage> QueryMessages, IReadOnlyList<ChatMessage> ResponseMessages) Split(
+            IReadOnlyList<ChatMessage> conversation)
+        {
+            for (int i = 0; i < conversation.Count; i++)
+            {
+                var msg = conversation[i];
+                bool callsRetrieveMemory = msg.Role == ChatRole.Assistant
+                    && msg.Contents is not null
+                    && msg.Contents.OfType<FunctionCallContent>().Any(fc => fc.Name == "retrieve_memory");
+
+                if (callsRetrieveMemory)
+                {
+                    return (conversation.Take(i).ToList(), conversation.Skip(i).ToList());
+                }
+            }
+
+            // Fallback to last-turn split
+            return ConversationSplitters.LastTurn.Split(conversation);
+        }
+    }
+
+ // ---------------------------------------------------------------
+ // ExpectedToolCall tests
+ // ---------------------------------------------------------------
+
+ [Fact]
+ public void ExpectedToolCall_NameOnly()
+ {
+ var tc = new ExpectedToolCall("get_weather");
+ Assert.Equal("get_weather", tc.Name);
+ Assert.Null(tc.Arguments);
+ }
+
+ [Fact]
+ public void ExpectedToolCall_NameAndArgs()
+ {
+ var args = new Dictionary { ["location"] = "NYC" };
+ var tc = new ExpectedToolCall("get_weather", args);
+ Assert.Equal("get_weather", tc.Name);
+ Assert.NotNull(tc.Arguments);
+ Assert.Equal("NYC", tc.Arguments["location"]);
+ }
+
+ [Fact]
+ public void EvalItem_ExpectedToolCalls_DefaultNull()
+ {
+ var item = CreateItem();
+ Assert.Null(item.ExpectedToolCalls);
+ }
+
+ [Fact]
+ public void EvalItem_ExpectedToolCalls_CanBeSet()
+ {
+ var item = CreateItem();
+ item.ExpectedToolCalls = new List
+ {
+ new("get_weather", new Dictionary { ["location"] = "NYC" }),
+ new("book_flight"),
+ };
+
+ Assert.NotNull(item.ExpectedToolCalls);
+ Assert.Equal(2, item.ExpectedToolCalls.Count);
+ Assert.Equal("get_weather", item.ExpectedToolCalls[0].Name);
+ Assert.Null(item.ExpectedToolCalls[1].Arguments);
+ }
+
+ [Fact]
+ public async Task LocalEvaluator_PopulatesInputItems_ForAuditingAsync()
+ {
+ // Arrange
+ var check = FunctionEvaluator.Create("is_sunny",
+ (string response) => response.Contains("sunny", StringComparison.OrdinalIgnoreCase));
+
+ var evaluator = new LocalEvaluator(check);
+ var items = new List
+ {
+ CreateItem(query: "Weather?", response: "It's sunny!"),
+ CreateItem(query: "Temp?", response: "72 degrees"),
+ };
+
+ // Act
+ var results = await evaluator.EvaluateAsync(items);
+
+ // Assert — InputItems carries the original query/response for auditing
+ Assert.NotNull(results.InputItems);
+ Assert.Equal(2, results.InputItems.Count);
+ Assert.Equal("Weather?", results.InputItems[0].Query);
+ Assert.Equal("It's sunny!", results.InputItems[0].Response);
+ Assert.Equal("Temp?", results.InputItems[1].Query);
+ Assert.Equal("72 degrees", results.InputItems[1].Response);
+
+ // Results and InputItems are positionally correlated
+ Assert.Equal(results.Items.Count, results.InputItems.Count);
+ }
+
+ // ---------------------------------------------------------------
+ // AgentEvaluationResults.AllPassed edge-case tests
+ // ---------------------------------------------------------------
+
+    [Fact]
+    public void AllPassed_EmptyItems_NoSubResults_ReturnsFalse()
+    {
+        var results = new AgentEvaluationResults("test", Array.Empty<EvaluationResult>());
+        Assert.False(results.AllPassed); // an empty result set is expected not to count as "all passed"
+        Assert.Equal(0, results.Total);
+    }
+
+    [Fact]
+    public void AllPassed_SubResultsAllPass_OverallFails_ReturnsFalse()
+    {
+        // Overall has a failing item
+        var failMetric = new BooleanMetric("check", false)
+        {
+            Interpretation = new EvaluationMetricInterpretation
+            {
+                Rating = EvaluationRating.Unacceptable,
+                Failed = true,
+            },
+        };
+        var failResult = new EvaluationResult();
+        failResult.Metrics["check"] = failMetric;
+
+        var overall = new AgentEvaluationResults("test", new[] { failResult });
+
+        // Sub-results all pass
+        var passMetric = new BooleanMetric("check", true)
+        {
+            Interpretation = new EvaluationMetricInterpretation
+            {
+                Rating = EvaluationRating.Good,
+                Failed = false,
+            },
+        };
+        var passResult = new EvaluationResult();
+        passResult.Metrics["check"] = passMetric;
+
+        overall.SubResults = new Dictionary<string, AgentEvaluationResults>
+        {
+            ["agent1"] = new AgentEvaluationResults("sub", new[] { passResult }),
+        };
+
+        // Overall has a failing item, so AllPassed should be false
+        Assert.False(overall.AllPassed);
+    }
+
+ // ---------------------------------------------------------------
+ // BuildEvalItem tests
+ // ---------------------------------------------------------------
+
+ [Fact]
+ public void BuildEvalItem_SetsPropertiesCorrectly()
+ {
+ var userMsg = new ChatMessage(ChatRole.User, "test query");
+ var assistantMsg = new ChatMessage(ChatRole.Assistant, "response");
+ var inputMessages = new List { userMsg };
+ var response = new AgentResponse(assistantMsg);
+
+ var item = AgentEvaluationExtensions.BuildEvalItem("test query", response, inputMessages, null!);
+
+ Assert.Equal("test query", item.Query);
+ Assert.NotNull(item.RawResponse);
+ }
+
+ [Fact]
+ public void BuildEvalItem_DoesNotMutateInputMessages()
+ {
+ // Arrange
+ var userMsg = new ChatMessage(ChatRole.User, "hello");
+ var assistantMsg = new ChatMessage(ChatRole.Assistant, "world");
+ var inputMessages = new List { userMsg };
+ var response = new AgentResponse(assistantMsg);
+
+ // Act
+ var item = AgentEvaluationExtensions.BuildEvalItem("hello", response, inputMessages, null!);
+
+ // Assert — input list is not mutated
+ Assert.Single(inputMessages);
+ Assert.Equal(userMsg, inputMessages[0]);
+
+ // But the EvalItem's conversation includes the response message
+ Assert.Equal(2, item.Conversation.Count);
+ }
+
+ // ---------------------------------------------------------------
+ // BuildItemsFromResponses validation tests
+ // ---------------------------------------------------------------
+
+ [Fact]
+ public void BuildItemsFromResponses_MismatchedQueryAndResponseCount_Throws()
+ {
+ var queries = new[] { "q1", "q2" };
+ var responses = new[] { new AgentResponse(new ChatMessage(ChatRole.Assistant, "a1")) };
+
+ var ex = Assert.Throws(
+ () => AgentEvaluationExtensions.BuildItemsFromResponses(null!, responses, queries, null, null));
+ Assert.Contains("queries", ex.Message);
+ Assert.Contains("responses", ex.Message);
+ }
+
+ [Fact]
+ public void BuildItemsFromResponses_MismatchedExpectedOutput_Throws()
+ {
+ var queries = new[] { "q1" };
+ var responses = new[] { new AgentResponse(new ChatMessage(ChatRole.Assistant, "a1")) };
+ var expectedOutput = new[] { "e1", "e2" };
+
+ var ex = Assert.Throws(
+ () => AgentEvaluationExtensions.BuildItemsFromResponses(null!, responses, queries, expectedOutput, null));
+ Assert.Contains("expectedOutput", ex.Message);
+ }
+
+ [Fact]
+ public void BuildItemsFromResponses_MismatchedExpectedToolCalls_Throws()
+ {
+ var queries = new[] { "q1" };
+ var responses = new[] { new AgentResponse(new ChatMessage(ChatRole.Assistant, "a1")) };
+ var expectedToolCalls = new[] { new[] { new ExpectedToolCall("t1") }, new[] { new ExpectedToolCall("t2") } };
+
+ var ex = Assert.Throws(
+ () => AgentEvaluationExtensions.BuildItemsFromResponses(
+ null!, responses, queries, null, expectedToolCalls));
+ Assert.Contains("expectedToolCalls", ex.Message);
+ }
+
+ // ---------------------------------------------------------------
+ // FoundryEvals.BuildEvaluators tests
+ // ---------------------------------------------------------------
+
+ [Fact]
+ public void BuildEvaluators_QualityNames_ReturnsDistinctEvaluators()
+ {
+ var evaluators = AzureAI.FoundryEvals.BuildEvaluators(
+ new[] { AzureAI.FoundryEvals.Relevance, AzureAI.FoundryEvals.Coherence });
+
+ Assert.Equal(2, evaluators.Count);
+ }
+
+ [Fact]
+ public void BuildEvaluators_MultipleSafetyNames_SingleContentHarmEvaluator()
+ {
+ var evaluators = AzureAI.FoundryEvals.BuildEvaluators(
+ new[]
+ {
+ AzureAI.FoundryEvals.Violence,
+ AzureAI.FoundryEvals.Sexual,
+ AzureAI.FoundryEvals.SelfHarm,
+ AzureAI.FoundryEvals.HateUnfairness,
+ });
+
+ // All four safety names produce exactly one ContentHarmEvaluator
+ Assert.Single(evaluators);
+ }
+
+ [Fact]
+ public void BuildEvaluators_UnknownName_ThrowsArgumentException()
+ {
+ var names = new[] { "gobblygook" };
+ var ex = Assert.Throws(
+ () => AzureAI.FoundryEvals.BuildEvaluators(names));
+ Assert.Contains("gobblygook", ex.Message);
+ Assert.Contains("not supported", ex.Message, StringComparison.OrdinalIgnoreCase);
+ }
+
+ [Fact]
+ public void BuildEvaluators_DefaultSelection_ReturnsRelevanceAndCoherence()
+ {
+ // NOTE(review): names are passed explicitly; presumably they mirror the constructor's empty-array defaults, which this test does not exercise — confirm
+ var defaults = new[] { AzureAI.FoundryEvals.Relevance, AzureAI.FoundryEvals.Coherence };
+ var evaluators = AzureAI.FoundryEvals.BuildEvaluators(defaults);
+
+ Assert.Equal(2, evaluators.Count);
+ }
+}
diff --git a/dotnet/tests/Microsoft.Agents.AI.UnitTests/Microsoft.Agents.AI.UnitTests.csproj b/dotnet/tests/Microsoft.Agents.AI.UnitTests/Microsoft.Agents.AI.UnitTests.csproj
index ffa4417f34..8e1dba18bd 100644
--- a/dotnet/tests/Microsoft.Agents.AI.UnitTests/Microsoft.Agents.AI.UnitTests.csproj
+++ b/dotnet/tests/Microsoft.Agents.AI.UnitTests/Microsoft.Agents.AI.UnitTests.csproj
@@ -13,6 +13,16 @@
+
+
+
+
+
+
+
+
+
+
diff --git a/dotnet/tests/Microsoft.Agents.AI.Workflows.UnitTests/Microsoft.Agents.AI.Workflows.UnitTests.csproj b/dotnet/tests/Microsoft.Agents.AI.Workflows.UnitTests/Microsoft.Agents.AI.Workflows.UnitTests.csproj
index 58979a4f1b..6adedab6c3 100644
--- a/dotnet/tests/Microsoft.Agents.AI.Workflows.UnitTests/Microsoft.Agents.AI.Workflows.UnitTests.csproj
+++ b/dotnet/tests/Microsoft.Agents.AI.Workflows.UnitTests/Microsoft.Agents.AI.Workflows.UnitTests.csproj
@@ -4,6 +4,11 @@
$(NoWarn);MEAI001
+
+
+
+
+
diff --git a/dotnet/tests/Microsoft.Agents.AI.Workflows.UnitTests/WorkflowEvaluationTests.cs b/dotnet/tests/Microsoft.Agents.AI.Workflows.UnitTests/WorkflowEvaluationTests.cs
new file mode 100644
index 0000000000..1ab7e71a82
--- /dev/null
+++ b/dotnet/tests/Microsoft.Agents.AI.Workflows.UnitTests/WorkflowEvaluationTests.cs
@@ -0,0 +1,156 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+using System.Collections.Generic;
+
+namespace Microsoft.Agents.AI.Workflows.UnitTests;
+
+/// <summary>
+/// Tests for <see cref="WorkflowEvaluationExtensions"/>.
+/// </summary>
+public sealed class WorkflowEvaluationTests
+{
+    [Fact]
+    public void ExtractAgentData_EmptyEvents_ReturnsEmpty()
+    {
+        var result = WorkflowEvaluationExtensions.ExtractAgentData(new List<WorkflowEvent>(), splitter: null);
+
+        Assert.Empty(result);
+    }
+
+    [Fact]
+    public void ExtractAgentData_MatchedPair_ReturnsItem()
+    {
+        var events = new List<WorkflowEvent>
+        {
+            new ExecutorInvokedEvent("agent-1", "What is the weather?"),
+            new ExecutorCompletedEvent("agent-1", "It's sunny."),
+        };
+
+        var result = WorkflowEvaluationExtensions.ExtractAgentData(events, splitter: null);
+
+        Assert.Single(result);
+        Assert.True(result.ContainsKey("agent-1"));
+        Assert.Single(result["agent-1"]);
+        Assert.Equal("What is the weather?", result["agent-1"][0].Query);
+        Assert.Equal("It's sunny.", result["agent-1"][0].Response);
+        Assert.Equal(2, result["agent-1"][0].Conversation.Count);
+    }
+
+    [Fact]
+    public void ExtractAgentData_UnmatchedInvocation_NotIncluded()
+    {
+        // An invocation that never receives a matching completion must not appear in the results.
+        var events = new List<WorkflowEvent>
+        {
+            new ExecutorInvokedEvent("agent-1", "Hello"),
+        };
+
+        var result = WorkflowEvaluationExtensions.ExtractAgentData(events, splitter: null);
+
+        Assert.Empty(result);
+    }
+
+    [Fact]
+    public void ExtractAgentData_CompletionWithoutInvocation_NotIncluded()
+    {
+        // A completion that was not preceded by an invocation must not appear in the results.
+        var events = new List<WorkflowEvent>
+        {
+            new ExecutorCompletedEvent("agent-1", "Response"),
+        };
+
+        var result = WorkflowEvaluationExtensions.ExtractAgentData(events, splitter: null);
+
+        Assert.Empty(result);
+    }
+
+    [Fact]
+    public void ExtractAgentData_MultipleAgents_SeparatedByExecutorId()
+    {
+        var events = new List<WorkflowEvent>
+        {
+            new ExecutorInvokedEvent("agent-1", "Q1"),
+            new ExecutorInvokedEvent("agent-2", "Q2"),
+            new ExecutorCompletedEvent("agent-1", "A1"),
+            new ExecutorCompletedEvent("agent-2", "A2"),
+        };
+
+        var result = WorkflowEvaluationExtensions.ExtractAgentData(events, splitter: null);
+
+        Assert.Equal(2, result.Count);
+        Assert.Equal("Q1", result["agent-1"][0].Query);
+        Assert.Equal("A1", result["agent-1"][0].Response);
+        Assert.Equal("Q2", result["agent-2"][0].Query);
+        Assert.Equal("A2", result["agent-2"][0].Response);
+    }
+
+    [Fact]
+    public void ExtractAgentData_DuplicateExecutorId_LastInvocationUsed()
+    {
+        // When the same executor is invoked twice before it completes,
+        // the second invocation replaces the first as the recorded query.
+        var events = new List<WorkflowEvent>
+        {
+            new ExecutorInvokedEvent("agent-1", "First question"),
+            new ExecutorInvokedEvent("agent-1", "Second question"),
+            new ExecutorCompletedEvent("agent-1", "Answer"),
+        };
+
+        var result = WorkflowEvaluationExtensions.ExtractAgentData(events, splitter: null);
+
+        Assert.Single(result);
+        Assert.Single(result["agent-1"]);
+        Assert.Equal("Second question", result["agent-1"][0].Query);
+    }
+
+    [Fact]
+    public void ExtractAgentData_MultipleRoundsForSameExecutor_AllCaptured()
+    {
+        // The same executor goes through invoked→completed twice (two sequential rounds).
+        var events = new List<WorkflowEvent>
+        {
+            new ExecutorInvokedEvent("agent-1", "Q1"),
+            new ExecutorCompletedEvent("agent-1", "A1"),
+            new ExecutorInvokedEvent("agent-1", "Q2"),
+            new ExecutorCompletedEvent("agent-1", "A2"),
+        };
+
+        var result = WorkflowEvaluationExtensions.ExtractAgentData(events, splitter: null);
+
+        Assert.Single(result); // one executor
+        Assert.Equal(2, result["agent-1"].Count); // two items, one per round
+        Assert.Equal("Q1", result["agent-1"][0].Query);
+        Assert.Equal("Q2", result["agent-1"][1].Query);
+    }
+
+    [Fact]
+    public void ExtractAgentData_NullData_UsesEmptyString()
+    {
+        var events = new List<WorkflowEvent>
+        {
+            new ExecutorInvokedEvent("agent-1", null!),
+            new ExecutorCompletedEvent("agent-1", null),
+        };
+
+        var result = WorkflowEvaluationExtensions.ExtractAgentData(events, splitter: null);
+
+        Assert.Single(result);
+        Assert.Equal(string.Empty, result["agent-1"][0].Query);
+        Assert.Equal(string.Empty, result["agent-1"][0].Response);
+    }
+
+    [Fact]
+    public void ExtractAgentData_WithSplitter_SetOnItems()
+    {
+        var splitter = ConversationSplitters.LastTurn;
+        var events = new List<WorkflowEvent>
+        {
+            new ExecutorInvokedEvent("agent-1", "Q"),
+            new ExecutorCompletedEvent("agent-1", "A"),
+        };
+
+        var result = WorkflowEvaluationExtensions.ExtractAgentData(events, splitter);
+
+        Assert.Equal(splitter, result["agent-1"][0].Splitter);
+    }
+}