|
| 1 | +--- |
| 2 | +status: accepted |
| 3 | +contact: sergeymenshykh |
| 4 | +date: 2025-02-05 |
| 5 | +deciders: dmytrostruk, markwallace, rbarreto, sergeymenshykh, westey-m |
| 6 | +--- |
| 7 | + |
| 8 | +# Hybrid Model Orchestration |
| 9 | + |
| 10 | +## Context and Problem Statement |
| 11 | +Taking into account the constantly emerging and improving local and cloud-based models, in addition to the growing demand for utilizing local AI models running on local devices' NPUs, |
| 12 | +AI-powered applications need to be able to effectively and seamlessly leverage both local and cloud models for inference to achieve the best AI user experience. |
| 13 | + |
| 14 | +## Decision Drivers |
| 15 | + |
| 16 | +1. The model orchestration layer should be simple and extensible. |
| 17 | +2. The model orchestration layer client code should not be aware of or deal with the underlying complexities. |
| 18 | +3. The model orchestration layer should allow for different strategies for selecting the best model(s) for the task at hand. |
| 19 | + |
| 20 | +## Considered Implementation Options |
| 21 | + |
| 22 | +The following options consider a few ways to implement the model orchestration layer. |
| 23 | + |
| 24 | +### Option 1: IChatClient implementation per orchestration strategy |
| 25 | + |
| 26 | +This option presents a simple and straightforward approach to implementing the model orchestration layer. Each strategy is implemented as a separate implementation of the IChatClient interface. |
| 27 | + |
| 28 | +For example, a fallback strategy that uses the first configured chat client for inference and falls back to the next one if the AI model is not available may be implemented as follows: |
| 29 | +```csharp |
| 30 | +public sealed class FallbackChatClient : IChatClient |
| 31 | +{ |
| 32 | + private readonly IChatClient[] _clients; |
| 33 | + |
| 34 | + public FallbackChatClient(params IChatClient[] clients) |
| 35 | + { |
| 36 | + this._clients = clients; |
| 37 | + } |
| 38 | + |
| 39 | +    public async Task<Microsoft.Extensions.AI.ChatCompletion> CompleteAsync(IList<ChatMessage> chatMessages, ChatOptions? options = null, CancellationToken cancellationToken = default) |
| 40 | +    { |
| 41 | +        foreach (var client in this._clients) |
| 42 | +        { |
| 43 | +            try |
| 44 | +            { |
| 45 | +                // Await here so that a failed request is caught and triggers the fallback |
| 46 | +                return await client.CompleteAsync(chatMessages, options, cancellationToken); |
| 47 | +            } |
| 48 | +            catch (HttpRequestException ex) |
| 49 | +            { |
| 50 | +                if (ex.StatusCode >= HttpStatusCode.InternalServerError) |
| 51 | +                { |
| 52 | +                    // Try the next client |
| 53 | +                    continue; |
| 54 | +                } |
| 55 | + |
| 56 | +                throw; |
| 57 | +            } |
| 58 | +        } |
| 59 | + |
| 60 | +        throw new InvalidOperationException("No client provided for chat completion."); |
| 61 | +    } |
| 59 | + |
| 60 | + public IAsyncEnumerable<StreamingChatCompletionUpdate> CompleteStreamingAsync(IList<ChatMessage> chatMessages, ChatOptions? options = null, CancellationToken cancellationToken = default) |
| 61 | + { |
| 62 | + ... |
| 63 | + } |
| 64 | + |
| 65 | + public void Dispose() { /*We can't dispose clients here because they can be used up the stack*/ } |
| 66 | + |
| 67 | + public ChatClientMetadata Metadata => new ChatClientMetadata(); |
| 68 | + |
| 69 | + public object? GetService(Type serviceType, object? serviceKey = null) => null; |
| 70 | +} |
| 71 | +``` |
| 72 | + |
| 73 | +Other orchestration strategies, such as latency-based or token-based strategies, can be implemented in a similar way: a class that implements the IChatClient interface and the corresponding chat client selection strategy. |
| 74 | + |
| 75 | +Pros: |
| 76 | +- Does not require any new abstraction. |
| 77 | +- Simple and straightforward implementation. |
| 78 | +- Can be sufficient for most use cases. |
| 79 | + |
| 80 | +### Option 2: HybridChatClient class with chat completion handler(s) per orchestration strategy |
| 81 | + |
| 82 | +This option introduces a HybridChatClient class that implements the IChatClient interface and delegates the selection routine to a provided handler represented by the abstract ChatCompletionHandler class: |
| 83 | +```csharp |
| 84 | +public sealed class HybridChatClient : IChatClient |
| 85 | +{ |
| 86 | + private readonly IChatClient[] _chatClients; |
| 87 | + private readonly ChatCompletionHandler _handler; |
| 88 | + private readonly Kernel? _kernel; |
| 89 | + |
| 90 | + public HybridChatClient(IChatClient[] chatClients, ChatCompletionHandler handler, Kernel? kernel = null) |
| 91 | + { |
| 92 | + this._chatClients = chatClients; |
| 93 | + this._handler = handler; |
| 94 | + this._kernel = kernel; |
| 95 | + } |
| 96 | + |
| 97 | + public Task<Extensions.AI.ChatCompletion> CompleteAsync(IList<ChatMessage> chatMessages, ChatOptions? options = null, CancellationToken cancellationToken = default) |
| 98 | + { |
| 99 | + return this._handler.CompleteAsync( |
| 100 | + new ChatCompletionHandlerContext |
| 101 | + { |
| 102 | + ChatMessages = chatMessages, |
| 103 | + Options = options, |
| 104 | + ChatClients = this._chatClients.ToDictionary(c => c, c => (CompletionContext?)null), |
| 105 | + Kernel = this._kernel, |
| 106 | + }, cancellationToken); |
| 107 | + } |
| 108 | + |
| 109 | + public IAsyncEnumerable<StreamingChatCompletionUpdate> CompleteStreamingAsync(IList<ChatMessage> chatMessages, ChatOptions? options = null, CancellationToken cancellationToken = default) |
| 110 | + { |
| 111 | + ... |
| 112 | + } |
| 113 | + |
| 114 | + ... |
| 115 | +} |
| 116 | + |
| 117 | +public abstract class ChatCompletionHandler |
| 118 | +{ |
| 119 | + public abstract Task<Extensions.AI.ChatCompletion> CompleteAsync(ChatCompletionHandlerContext context, CancellationToken cancellationToken = default); |
| 120 | + |
| 121 | + public abstract IAsyncEnumerable<StreamingChatCompletionUpdate> CompleteStreamingAsync(ChatCompletionHandlerContext context, CancellationToken cancellationToken = default); |
| 122 | +} |
| 123 | +``` |
| 124 | + |
| 125 | +The HybridChatClient class passes all the necessary information to the handler via the ChatCompletionHandlerContext class, which contains the list of chat clients, chat messages, options, and Kernel instance. |
| 126 | +```csharp |
| 127 | +public class ChatCompletionHandlerContext |
| 128 | +{ |
| 129 | + public IDictionary<IChatClient, CompletionContext?> ChatClients { get; init; } |
| 130 | + |
| 131 | + public IList<ChatMessage> ChatMessages { get; init; } |
| 132 | + |
| 133 | + public ChatOptions? Options { get; init; } |
| 134 | + |
| 135 | + public Kernel? Kernel { get; init; } |
| 136 | +} |
| 137 | +``` |
| 138 | + |
| 139 | +The fallback strategy shown in the previous option can be implemented as the following handler: |
| 140 | +```csharp |
| 141 | +public class FallbackChatCompletionHandler : ChatCompletionHandler |
| 142 | +{ |
| 143 | + public override async Task<Extensions.AI.ChatCompletion> CompleteAsync(ChatCompletionHandlerContext context, CancellationToken cancellationToken = default) |
| 144 | + { |
| 145 | + for (int i = 0; i < context.ChatClients.Count; i++) |
| 146 | + { |
| 147 | + var chatClient = context.ChatClients.ElementAt(i).Key; |
| 148 | + |
| 149 | + try |
| 150 | + { |
| 151 | +            return await chatClient.CompleteAsync(context.ChatMessages, context.Options, cancellationToken); |
| 152 | + } |
| 153 | + catch (HttpRequestException ex) |
| 154 | + { |
| 155 | +                if (ex.StatusCode >= HttpStatusCode.InternalServerError) |
| 156 | + { |
| 157 | + // Try the next client |
| 158 | + continue; |
| 159 | + } |
| 160 | + |
| 161 | + throw; |
| 162 | + } |
| 163 | + } |
| 164 | + |
| 165 | + throw new InvalidOperationException("No client provided for chat completion."); |
| 166 | + } |
| 167 | + |
| 168 | + public override async IAsyncEnumerable<StreamingChatCompletionUpdate> CompleteStreamingAsync(ChatCompletionHandlerContext context, CancellationToken cancellationToken = default) |
| 169 | + { |
| 170 | + ... |
| 171 | + } |
| 172 | +} |
| 173 | +``` |
| 174 | + |
| 175 | +and the caller code would look like this: |
| 176 | +```csharp |
| 177 | +IChatClient onnxChatClient = new OnnxChatClient(...); |
| 178 | + |
| 179 | +IChatClient openAIChatClient = new OpenAIChatClient(...); |
| 180 | + |
| 181 | +// Tries the first client and falls back to the next one if the first one fails |
| 182 | +FallbackChatCompletionHandler handler = new FallbackChatCompletionHandler(...); |
| 183 | + |
| 184 | +IChatClient hybridChatClient = new HybridChatClient([onnxChatClient, openAIChatClient], handler); |
| 185 | + |
| 186 | +... |
| 187 | + |
| 188 | +var result = await hybridChatClient.CompleteAsync("Do I need an umbrella?", ...); |
| 189 | +``` |
| 190 | + |
| 191 | +The handlers can be chained to create more complex scenarios, where a handler performs some preprocessing and then delegates the call to another handler with an augmented chat clients list. |
| 192 | + |
| 193 | +For example, the first handler identifies that a cloud model has requested access to sensitive data and delegates the call handling to local models to process it. |
| 194 | + |
| 195 | +```csharp |
| 196 | +IChatClient onnxChatClient = new OnnxChatClient(...); |
| 197 | + |
| 198 | +IChatClient llamaChatClient = new LlamaChatClient(...); |
| 199 | + |
| 200 | +IChatClient openAIChatClient = new OpenAIChatClient(...); |
| 201 | + |
| 202 | +// Tries the first client and falls back to the next one if the first one fails |
| 203 | +FallbackChatCompletionHandler fallbackHandler = new FallbackChatCompletionHandler(...); |
| 204 | + |
| 205 | +// Check if the request contains sensitive data, identifies the client(s) allowed to work with the sensitive data, and delegates the call handling to the next handler. |
| 206 | +SensitiveDataHandler sensitiveDataHandler = new SensitiveDataHandler(fallbackHandler); |
| 207 | + |
| 208 | +IChatClient hybridChatClient = new HybridChatClient(new[] { onnxChatClient, llamaChatClient, openAIChatClient }, sensitiveDataHandler); |
| 209 | + |
| 210 | +var result = await hybridChatClient.CompleteAsync("Do I need an umbrella?", ...); |
| 211 | +``` |
| 212 | + |
| 213 | +Examples of complex orchestration scenarios: |
| 214 | + |
| 215 | +| First Handler | Second Handler | Scenario Description | |
| 216 | +|---------------------------------------|--------------------------------|---------------------------------------------------------------------------| |
| 217 | +| InputTokenThresholdEvaluationHandler | FastestChatCompletionHandler | Identifies models based on the prompt's input token size and each model's min/max token capacity, then returns the fastest model's response. | |
| 218 | +| InputTokenThresholdEvaluationHandler | RelevancyChatCompletionHandler | Identifies models based on the prompt's input token size and each model's min/max token capacity, then returns the most relevant response. | |
| 219 | +| InputTokenThresholdEvaluationHandler | FallbackChatCompletionHandler | Identifies models based on the prompt's input token size and each model's min/max token capacity, then returns the first available model's response. | |
| 220 | +| SensitiveDataRoutingHandler | FastestChatCompletionHandler | Identifies models based on data sensitivity, then returns the fastest model's response. | |
| 221 | +| SensitiveDataRoutingHandler | RelevancyChatCompletionHandler | Identifies models based on data sensitivity, then returns the most relevant response. | |
| 222 | +| SensitiveDataRoutingHandler | FallbackChatCompletionHandler | Identifies models based on data sensitivity, then returns the first available model's response. | |
| 223 | + |
| 224 | +Pros: |
| 225 | +- Allows reusing the same handlers to create various composite orchestration strategies. |
| 226 | + |
| 227 | +Cons: |
| 228 | +- Requires more new abstractions and components than the previous option: context classes and code for handling the next handler. |
| 229 | + |
| 230 | +<br/> |
| 231 | + |
| 232 | +POC demonstrating this option can be found [here](https://github.com/microsoft/semantic-kernel/pull/10412). |
| 233 | + |
| 234 | +### Option 3: Implementing the existing IAIServiceSelector interface |
| 235 | + |
| 236 | +The Semantic Kernel has a mechanism that allows for the dynamic selection of AI services: |
| 237 | + |
| 238 | +```csharp |
| 239 | +public interface IAIServiceSelector |
| 240 | +{ |
| 241 | + bool TrySelectAIService<T>( |
| 242 | + Kernel kernel, |
| 243 | + KernelFunction function, |
| 244 | + KernelArguments arguments, |
| 245 | + [NotNullWhen(true)] out T? service, |
| 246 | + out PromptExecutionSettings? serviceSettings) where T : class, IAIService; |
| 247 | +} |
| 248 | +``` |
| 249 | + |
| 250 | +However, this mechanism requires specific context - the kernel, function, and arguments which may not always be available. |
| 251 | +Additionally, it only works with implementations of the IAIService interface, which may not be compatible with all AI services, |
| 252 | +such as those in Microsoft.Extensions.AI that implement the IChatClient interface. |
| 253 | + |
| 254 | +Furthermore, this mechanism cannot be used in orchestration scenarios where an AI service needs to be prompted first to determine its availability, latency, etc. |
| 255 | +For example, to check if an AI service is available, the selector would need to send chat messages with options to the service. It should then return |
| 256 | +the completion if the service is available, or fallback to another service if it is not. Given that the TrySelectAIService method does not accept a list of |
| 257 | +chat messages or options, it is impossible to send chat messages using this method. Even if it were possible, the consumer code would have to resend the same |
| 258 | +chat messages to the selected service to obtain a completion, as the selector does not return the completion itself. Additionally, the TrySelectAIService method |
| 259 | +is synchronous, making it difficult to send chat messages without using synchronous code, which is generally discouraged. |
| 260 | + |
| 261 | +Looking at the above, it is clear that the IAIServiceSelector interface is not suitable for the hybrid orchestration of AI services since it was designed for a different purpose: |
| 262 | +to synchronously select an instance of an AI service based on SK context and service metadata without taking the results of completion and streamed completion methods into account. |
| 263 | + |
| 264 | +Pros: |
| 265 | +- Reuses the existing mechanism for AI service selection. |
| 266 | + |
| 267 | +Cons: |
| 268 | +- Not suitable for all AI services. |
| 269 | +- Requires context that may not be available in all scenarios. |
| 270 | +- Consumer code must be aware of the IAIServiceSelector interface instead of simply using the IChatClient interface. |
| 271 | +- Synchronous method. |
| 272 | + |
| 273 | +## Decision Outcome |
| 274 | + |
| 275 | +Chosen option: Option 1 because it does not require any new abstraction; its simplicity and straightforwardness are sufficient for most use cases. |
| 276 | +Option 2 can be considered in the future if more complex orchestration scenarios are required. |
0 commit comments