diff --git a/src/RockBot.Agent/Program.cs b/src/RockBot.Agent/Program.cs
index 3119d815..1899f4a0 100644
--- a/src/RockBot.Agent/Program.cs
+++ b/src/RockBot.Agent/Program.cs
@@ -1,4 +1,5 @@
 using System.ClientModel;
+using System.ClientModel.Primitives;
 using Microsoft.Extensions.AI;
 using Microsoft.Extensions.Configuration;
 using Microsoft.Extensions.DependencyInjection;
@@ -127,7 +128,10 @@ IChatClient BuildOpenAIClient(LlmTierConfig config)
             Endpoint = new Uri(config.Endpoint!),
             // Extend from the 100s default — subagents with large tool sets generate
             // longer responses that can exceed the default before the body is fully read.
-            NetworkTimeout = TimeSpan.FromMinutes(5)
+            NetworkTimeout = TimeSpan.FromMinutes(5),
+            // Disable SDK retry: the LlmGateway owns rate-limit retry policy. Without
+            // this, gateway and SDK both retry on 429 and silently double-retry.
+            RetryPolicy = new ClientRetryPolicy(maxRetries: 0)
         })
         .GetChatClient(config.ModelId!).AsIChatClient();
 }
diff --git a/src/RockBot.Host.Abstractions/LlmGatewayOptions.cs b/src/RockBot.Host.Abstractions/LlmGatewayOptions.cs
index 36b705cb..f2583676 100644
--- a/src/RockBot.Host.Abstractions/LlmGatewayOptions.cs
+++ b/src/RockBot.Host.Abstractions/LlmGatewayOptions.cs
@@ -27,4 +27,20 @@ public sealed class LlmGatewayOptions
     /// Expensive judgment calls; lower cap.
     /// </summary>
     public int HighMaxConcurrent { get; set; } = 2;
+
+    /// <summary>
+    /// Maximum number of retry attempts on rate-limit (HTTP 429) responses before
+    /// the call surfaces the failure to the caller. Each retry honors any
+    /// <c>Retry-After</c> response header; in its absence, exponential backoff
+    /// (1s, 2s, 4s, 8s, ...) is used, capped by <see cref="MaxBackoffSeconds"/>.
+    /// Set to zero to disable retry on rate-limit errors.
+    /// </summary>
+    public int MaxRateLimitRetries { get; set; } = 5;
+
+    /// <summary>
+    /// Maximum backoff (in seconds) between retry attempts when no
+    /// <c>Retry-After</c> header is supplied by the provider. Caps the
+    /// exponential growth of fallback backoff.
+    /// </summary>
+    public int MaxBackoffSeconds { get; set; } = 16;
 }
diff --git a/src/RockBot.Host/DefaultLlmRateLimitClassifier.cs b/src/RockBot.Host/DefaultLlmRateLimitClassifier.cs
new file mode 100644
index 00000000..29a6bc21
--- /dev/null
+++ b/src/RockBot.Host/DefaultLlmRateLimitClassifier.cs
@@ -0,0 +1,67 @@
+using System.ClientModel;
+using System.ClientModel.Primitives;
+using System.Globalization;
+using System.Net;
+
+namespace RockBot.Host;
+
+/// <summary>
+/// Default <see cref="ILlmRateLimitClassifier"/>: detects rate-limit errors
+/// surfaced by the OpenAI SDK (via <see cref="ClientResultException"/>) and
+/// generic <see cref="HttpRequestException"/>s carrying a 429 status. Walks the
+/// exception chain so wrapped errors are caught.
+/// </summary>
+internal sealed class DefaultLlmRateLimitClassifier : ILlmRateLimitClassifier
+{
+    public bool TryClassify(Exception exception, out TimeSpan? retryAfter)
+    {
+        retryAfter = null;
+
+        var current = exception;
+        while (current is not null)
+        {
+            if (current is ClientResultException cre && cre.Status == 429)
+            {
+                retryAfter = ParseRetryAfter(cre.GetRawResponse());
+                return true;
+            }
+
+            if (current is HttpRequestException hre && hre.StatusCode == HttpStatusCode.TooManyRequests)
+            {
+                // HttpRequestException does not carry response headers, so we
+                // cannot extract Retry-After here. The gateway falls back to
+                // exponential backoff.
+                return true;
+            }
+
+            current = current.InnerException;
+        }
+
+        return false;
+    }
+
+    private static TimeSpan? ParseRetryAfter(PipelineResponse? response)
+    {
+        if (response is null) return null;
+        if (!response.Headers.TryGetValue("retry-after", out var raw) || string.IsNullOrEmpty(raw))
+            return null;
+
+        // Numeric form: integer seconds.
+        if (int.TryParse(raw, NumberStyles.Integer, CultureInfo.InvariantCulture, out var seconds)
+            && seconds >= 0)
+        {
+            return TimeSpan.FromSeconds(seconds);
+        }
+
+        // HTTP-date form (RFC 7231).
+        if (DateTimeOffset.TryParse(raw, CultureInfo.InvariantCulture,
+                DateTimeStyles.AssumeUniversal | DateTimeStyles.AdjustToUniversal,
+                out var when))
+        {
+            var diff = when - DateTimeOffset.UtcNow;
+            return diff > TimeSpan.Zero ? diff : TimeSpan.Zero;
+        }
+
+        return null;
+    }
+}
diff --git a/src/RockBot.Host/HostDiagnostics.cs b/src/RockBot.Host/HostDiagnostics.cs
index 9b4f81d0..3fcf5272 100644
--- a/src/RockBot.Host/HostDiagnostics.cs
+++ b/src/RockBot.Host/HostDiagnostics.cs
@@ -58,6 +58,18 @@ public static class HostDiagnostics
             unit: "ms",
             description: "Time spent waiting for a per-tier LLM gateway slot");
 
+    /// <summary>
+    /// Number of rate-limit retries performed by the LLM gateway. Tagged by tier
+    /// and by retry-after source ("header" when the provider supplied a
+    /// <c>Retry-After</c> hint, "backoff" when the gateway used its
+    /// exponential-backoff fallback).
+    /// </summary>
+    public static readonly Counter<long> LlmGatewayRateLimitRetries =
+        Meter.CreateCounter<long>(
+            "rockbot.llm.gateway.rate_limit_retries",
+            unit: "{retry}",
+            description: "Number of rate-limit (429) retries performed by the LLM gateway");
+
     // ── Agent turn metrics — recorded at architectural boundaries ─────────────
 
     /// <summary>Duration from user message receipt to final reply published.</summary>
diff --git a/src/RockBot.Host/ILlmRateLimitClassifier.cs b/src/RockBot.Host/ILlmRateLimitClassifier.cs
new file mode 100644
index 00000000..46799805
--- /dev/null
+++ b/src/RockBot.Host/ILlmRateLimitClassifier.cs
@@ -0,0 +1,22 @@
+namespace RockBot.Host;
+
+/// <summary>
+/// Classifies exceptions thrown by underlying LLM SDK calls to determine whether
+/// they represent a rate-limit (HTTP 429) condition that the gateway should retry.
+/// </summary>
+/// <remarks>
+/// Implementations walk the exception chain to find a rate-limit indicator and,
+/// where possible, extract the provider's <c>Retry-After</c> hint so the gateway
+/// can honor it precisely instead of falling back to exponential backoff.
+/// Pluggable so different providers (OpenAI, Anthropic-direct, Copilot, etc.)
+/// can surface their own rate-limit shapes.
+/// </remarks>
+internal interface ILlmRateLimitClassifier
+{
+    /// <summary>
+    /// Returns true if <paramref name="exception"/> indicates a rate-limit
+    /// condition that should be retried. <paramref name="retryAfter"/> is set to
+    /// the provider-supplied wait duration when available.
+    /// </summary>
+    bool TryClassify(Exception exception, out TimeSpan? retryAfter);
+}
diff --git a/src/RockBot.Host/LlmGateway.cs b/src/RockBot.Host/LlmGateway.cs
index 595db073..0fb622fb 100644
--- a/src/RockBot.Host/LlmGateway.cs
+++ b/src/RockBot.Host/LlmGateway.cs
@@ -28,9 +28,15 @@ namespace RockBot.Host;
 internal sealed class LlmGateway : ILlmGateway, IDisposable
 {
     private readonly TierSlot[] _slots;
+    private readonly ILlmRateLimitClassifier _classifier;
     private readonly ILogger<LlmGateway> _logger;
+    private readonly int _maxRetries;
+    private readonly int _maxBackoffSeconds;
 
-    public LlmGateway(IOptions<LlmGatewayOptions> options, ILogger<LlmGateway> logger)
+    public LlmGateway(
+        IOptions<LlmGatewayOptions> options,
+        ILlmRateLimitClassifier classifier,
+        ILogger<LlmGateway> logger)
     {
         var opts = options.Value;
 
@@ -53,11 +59,26 @@ public LlmGateway(
             _slots[(int)tier] = new TierSlot(cap);
         }
 
+        if (opts.MaxRateLimitRetries < 0)
+            throw new ArgumentOutOfRangeException(
+                nameof(options),
+                $"LlmGatewayOptions.MaxRateLimitRetries must be >= 0 (was {opts.MaxRateLimitRetries}).");
+
+        if (opts.MaxBackoffSeconds < 1)
+            throw new ArgumentOutOfRangeException(
+                nameof(options),
+                $"LlmGatewayOptions.MaxBackoffSeconds must be >= 1 (was {opts.MaxBackoffSeconds}).");
+
+        _classifier = classifier;
         _logger = logger;
+        _maxRetries = opts.MaxRateLimitRetries;
+        _maxBackoffSeconds = opts.MaxBackoffSeconds;
 
         _logger.LogInformation(
-            "LlmGateway: per-tier concurrency caps Low={Low} Balanced={Balanced} High={High}",
-            opts.LowMaxConcurrent, opts.BalancedMaxConcurrent, opts.HighMaxConcurrent);
+            "LlmGateway: per-tier concurrency caps Low={Low} Balanced={Balanced} High={High}, " +
+            "rate-limit retries Max={MaxRetries} backoff cap={MaxBackoff}s",
+            opts.LowMaxConcurrent, opts.BalancedMaxConcurrent, opts.HighMaxConcurrent,
+            _maxRetries, _maxBackoffSeconds);
     }
 
     /// <inheritdoc />
@@ -99,7 +120,8 @@ public async Task<T> ExecuteAsync<T>(
         Interlocked.Increment(ref slot.InFlight);
         try
         {
-            return await operation(cancellationToken).ConfigureAwait(false);
+            return await ExecuteWithRetryAsync(tier, tierTag, operation, cancellationToken)
+                .ConfigureAwait(false);
         }
         finally
         {
@@ -108,6 +130,61 @@ public async Task<T> ExecuteAsync<T>(
         }
     }
 
+    /// <summary>
+    /// Invokes <paramref name="operation"/> and, on rate-limit (HTTP 429) failures,
+    /// retries up to MaxRateLimitRetries times. The slot is held throughout
+    /// — releasing during retry waits does not help, since rate limits are per-tier
+    /// so any other call in the same tier would hit the same limit.
+    /// </summary>
+    private async Task<T> ExecuteWithRetryAsync<T>(
+        ModelTier tier,
+        KeyValuePair<string, object?> tierTag,
+        Func<CancellationToken, Task<T>> operation,
+        CancellationToken cancellationToken)
+    {
+        var attempt = 0;
+        while (true)
+        {
+            try
+            {
+                return await operation(cancellationToken).ConfigureAwait(false);
+            }
+            catch (Exception ex) when (
+                attempt < _maxRetries
+                && !cancellationToken.IsCancellationRequested
+                && _classifier.TryClassify(ex, out var classifierRetryAfter))
+            {
+                attempt++;
+
+                var source = classifierRetryAfter.HasValue ? "header" : "backoff";
+                var wait = classifierRetryAfter ?? ComputeBackoff(attempt);
+
+                _logger.LogWarning(
+                    "LlmGateway: rate-limit on tier {Tier} (attempt {Attempt}/{Max}); " +
+                    "waiting {WaitSeconds}s ({Source}) before retry",
+                    tier, attempt, _maxRetries, wait.TotalSeconds, source);
+
+                HostDiagnostics.LlmGatewayRateLimitRetries.Add(
+                    1,
+                    tierTag,
+                    new KeyValuePair<string, object?>("rockbot.llm.gateway.retry_after_source", source));
+
+                // Cancellation during the wait surfaces naturally: Task.Delay throws
+                // an OperationCanceledException that propagates to the caller.
+                await Task.Delay(wait, cancellationToken).ConfigureAwait(false);
+            }
+        }
+    }
+
+    private TimeSpan ComputeBackoff(int attempt)
+    {
+        // 1s, 2s, 4s, 8s, ..., capped at MaxBackoffSeconds.
+        // attempt is 1-based.
+        var seconds = Math.Min(Math.Pow(2, attempt - 1), _maxBackoffSeconds);
+        return TimeSpan.FromSeconds(seconds);
+    }
+
     public void Dispose()
     {
         foreach (var slot in _slots)
diff --git a/src/RockBot.Host/ServiceCollectionExtensions.cs b/src/RockBot.Host/ServiceCollectionExtensions.cs
index 0de383d6..90b675a4 100644
--- a/src/RockBot.Host/ServiceCollectionExtensions.cs
+++ b/src/RockBot.Host/ServiceCollectionExtensions.cs
@@ -35,6 +35,7 @@ public static IServiceCollection AddRockBotHost(
         services.AddSingleton();
         services.Configure(_ => { });
         services.Configure(_ => { });
+        services.AddSingleton<ILlmRateLimitClassifier, DefaultLlmRateLimitClassifier>();
         services.AddSingleton();
         services.AddTransient();
         services.AddSingleton();
diff --git a/src/RockBot.ResearchAgent/Program.cs b/src/RockBot.ResearchAgent/Program.cs
index f4079cf3..a5b23148 100644
--- a/src/RockBot.ResearchAgent/Program.cs
+++ b/src/RockBot.ResearchAgent/Program.cs
@@ -1,4 +1,5 @@
 using System.ClientModel;
+using System.ClientModel.Primitives;
 using Microsoft.Extensions.AI;
 using Microsoft.Extensions.Configuration;
 using Microsoft.Extensions.DependencyInjection;
@@ -49,7 +50,13 @@ IChatClient BuildClient(LlmTierConfig config)
 {
     return new OpenAIClient(
         new ApiKeyCredential(config.ApiKey!),
-        new OpenAIClientOptions { Endpoint = new Uri(config.Endpoint!) })
+        new OpenAIClientOptions
+        {
+            Endpoint = new Uri(config.Endpoint!),
+            // Disable SDK retry: the LlmGateway owns rate-limit retry policy.
+            // Without this, gateway and SDK both retry on 429 and silently double-retry.
+            RetryPolicy = new ClientRetryPolicy(maxRetries: 0)
+        })
         .GetChatClient(config.ModelId!).AsIChatClient();
 }
diff --git a/src/RockBot.SampleAgent/Program.cs b/src/RockBot.SampleAgent/Program.cs
index 78c9136f..1343f0fb 100644
--- a/src/RockBot.SampleAgent/Program.cs
+++ b/src/RockBot.SampleAgent/Program.cs
@@ -1,4 +1,5 @@
 using System.ClientModel;
+using System.ClientModel.Primitives;
 using Microsoft.Extensions.AI;
 using Microsoft.Extensions.Configuration;
 using Microsoft.Extensions.DependencyInjection;
@@ -26,7 +27,12 @@
 {
     var openAiClient = new OpenAIClient(
         new ApiKeyCredential(apiKey),
-        new OpenAIClientOptions { Endpoint = new Uri(endpoint) });
+        new OpenAIClientOptions
+        {
+            Endpoint = new Uri(endpoint),
+            // Disable SDK retry: the LlmGateway owns rate-limit retry policy.
+            RetryPolicy = new ClientRetryPolicy(maxRetries: 0)
+        });
 
     builder.Services.AddRockBotChatClient(
         openAiClient.GetChatClient(modelId).AsIChatClient());
diff --git a/tests/RockBot.Host.Tests/DefaultLlmRateLimitClassifierTests.cs b/tests/RockBot.Host.Tests/DefaultLlmRateLimitClassifierTests.cs
new file mode 100644
index 00000000..15c1e8d0
--- /dev/null
+++ b/tests/RockBot.Host.Tests/DefaultLlmRateLimitClassifierTests.cs
@@ -0,0 +1,81 @@
+using System.Net;
+
+namespace RockBot.Host.Tests;
+
+[TestClass]
+public class DefaultLlmRateLimitClassifierTests
+{
+    private readonly DefaultLlmRateLimitClassifier _classifier = new();
+
+    [TestMethod]
+    public void TryClassify_NonRateLimitException_ReturnsFalse()
+    {
+        var ex = new InvalidOperationException("boom");
+        var result = _classifier.TryClassify(ex, out var retryAfter);
+        Assert.IsFalse(result);
+        Assert.IsNull(retryAfter);
+    }
+
+    [TestMethod]
+    public void TryClassify_HttpRequestException_429_ReturnsTrue()
+    {
+        var ex = new HttpRequestException(
+            HttpRequestError.Unknown,
+            "rate limited",
+            inner: null,
+            statusCode: HttpStatusCode.TooManyRequests);
+
+        var result = _classifier.TryClassify(ex, out var retryAfter);
+        Assert.IsTrue(result);
+        // HttpRequestException carries no headers, so no Retry-After is extracted.
+        Assert.IsNull(retryAfter);
+    }
+
+    [TestMethod]
+    public void TryClassify_HttpRequestException_NotRateLimit_ReturnsFalse()
+    {
+        var ex = new HttpRequestException(
+            HttpRequestError.Unknown,
+            "server error",
+            inner: null,
+            statusCode: HttpStatusCode.InternalServerError);
+
+        var result = _classifier.TryClassify(ex, out var retryAfter);
+        Assert.IsFalse(result);
+        Assert.IsNull(retryAfter);
+    }
+
+    [TestMethod]
+    public void TryClassify_WrappedRateLimit_WalksInnerExceptions()
+    {
+        var inner = new HttpRequestException(
+            HttpRequestError.Unknown,
+            "rate limited",
+            inner: null,
+            statusCode: HttpStatusCode.TooManyRequests);
+        var outer = new InvalidOperationException("wrapper", inner);
+
+        var result = _classifier.TryClassify(outer, out _);
+        Assert.IsTrue(result, "Classifier must walk the inner-exception chain");
+    }
+
+    [TestMethod]
+    public void TryClassify_Aggregate_DoesNotWalkInnerExceptionsCollection()
+    {
+        // Document current behavior: AggregateException's InnerExceptions collection is
+        // not walked. Only AggregateException.InnerException (the first inner) is.
+        // If a future caller wraps via AggregateException, this test will fail and force
+        // an explicit decision.
+        var rateLimit = new HttpRequestException(
+            HttpRequestError.Unknown,
+            "rate limited",
+            inner: null,
+            statusCode: HttpStatusCode.TooManyRequests);
+        var agg = new AggregateException(new InvalidOperationException("first"), rateLimit);
+
+        var result = _classifier.TryClassify(agg, out _);
+        // agg.InnerException is the first one (InvalidOperationException), not the
+        // rate-limit one. Classifier walks only the linear chain.
+        Assert.IsFalse(result);
+    }
+}
diff --git a/tests/RockBot.Host.Tests/LlmGatewayTests.cs b/tests/RockBot.Host.Tests/LlmGatewayTests.cs
index 1c1fe9df..5852cb7f 100644
--- a/tests/RockBot.Host.Tests/LlmGatewayTests.cs
+++ b/tests/RockBot.Host.Tests/LlmGatewayTests.cs
@@ -6,15 +6,65 @@ namespace RockBot.Host.Tests;
 [TestClass]
 public class LlmGatewayTests
 {
-    private static LlmGateway CreateGateway(int low = 2, int balanced = 2, int high = 2)
+    private static LlmGateway CreateGateway(
+        int low = 2,
+        int balanced = 2,
+        int high = 2,
+        int maxRetries = 0,
+        int maxBackoffSeconds = 16,
+        ILlmRateLimitClassifier? classifier = null)
     {
         var options = Options.Create(new LlmGatewayOptions
         {
             LowMaxConcurrent = low,
             BalancedMaxConcurrent = balanced,
             HighMaxConcurrent = high,
+            MaxRateLimitRetries = maxRetries,
+            MaxBackoffSeconds = maxBackoffSeconds,
         });
-        return new LlmGateway(options, NullLogger<LlmGateway>.Instance);
+        return new LlmGateway(
+            options,
+            classifier ?? new NeverRateLimitClassifier(),
+            NullLogger<LlmGateway>.Instance);
+    }
+
+    /// <summary>Stub classifier that never reports rate-limit conditions.</summary>
+    private sealed class NeverRateLimitClassifier : ILlmRateLimitClassifier
+    {
+        public bool TryClassify(Exception exception, out TimeSpan? retryAfter)
+        {
+            retryAfter = null;
+            return false;
+        }
+    }
+
+    /// <summary>
+    /// Marker exception simulating a provider 429, optionally carrying a
+    /// Retry-After hint for the classifier to surface.
+    /// </summary>
+    private sealed class FakeRateLimitException(TimeSpan? retryAfter = null) : Exception("simulated 429")
+    {
+        public TimeSpan? RetryAfter { get; } = retryAfter;
+    }
+
+    /// <summary>
+    /// Stub classifier that recognises <see cref="FakeRateLimitException"/> as
+    /// rate-limit and surfaces the optional Retry-After hint carried on it.
+    /// </summary>
+    private sealed class FakeRateLimitClassifier : ILlmRateLimitClassifier
+    {
+        public bool TryClassify(Exception exception, out TimeSpan? retryAfter)
+        {
+            // Walk the inner-exception chain so wrapped throws are still recognised.
+            var current = exception;
+            while (current is not null)
+            {
+                if (current is FakeRateLimitException frle)
+                {
+                    retryAfter = frle.RetryAfter;
+                    return true;
+                }
+                current = current.InnerException;
+            }
+            retryAfter = null;
+            return false;
+        }
     }
 
     [TestMethod]
@@ -219,7 +269,169 @@ public void Constructor_CapBelowOne_Throws()
     {
         var bad = Options.Create(new LlmGatewayOptions { LowMaxConcurrent = 0 });
         Assert.ThrowsExactly<ArgumentOutOfRangeException>(() =>
-            new LlmGateway(bad, NullLogger<LlmGateway>.Instance));
+            new LlmGateway(bad, new NeverRateLimitClassifier(), NullLogger<LlmGateway>.Instance));
+    }
+
+    [TestMethod]
+    public void Constructor_NegativeMaxRetries_Throws()
+    {
+        var bad = Options.Create(new LlmGatewayOptions { MaxRateLimitRetries = -1 });
+        Assert.ThrowsExactly<ArgumentOutOfRangeException>(() =>
+            new LlmGateway(bad, new NeverRateLimitClassifier(), NullLogger<LlmGateway>.Instance));
+    }
+
+    [TestMethod]
+    public void Constructor_MaxBackoffBelowOne_Throws()
+    {
+        var bad = Options.Create(new LlmGatewayOptions { MaxBackoffSeconds = 0 });
+        Assert.ThrowsExactly<ArgumentOutOfRangeException>(() =>
+            new LlmGateway(bad, new NeverRateLimitClassifier(), NullLogger<LlmGateway>.Instance));
+    }
+
+    [TestMethod]
+    public async Task ExecuteAsync_NonRateLimitError_NotRetried()
+    {
+        using var gateway = CreateGateway(maxRetries: 5, classifier: new FakeRateLimitClassifier());
+        var attempts = 0;
+
+        await Assert.ThrowsExactlyAsync<InvalidOperationException>(async () =>
+            await gateway.ExecuteAsync<int>(
+                ModelTier.Balanced,
+                ct =>
+                {
+                    attempts++;
+                    throw new InvalidOperationException("not a 429");
+                },
+                CancellationToken.None));
+
+        Assert.AreEqual(1, attempts, "Non-rate-limit errors must not be retried");
+    }
+
+    [TestMethod]
+    public async Task ExecuteAsync_RateLimit_RetriesAndSucceeds()
+    {
+        using var gateway = CreateGateway(
+            maxRetries: 3,
+            classifier: new FakeRateLimitClassifier());
+        var attempts = 0;
+
+        var result = await gateway.ExecuteAsync(
+            ModelTier.Balanced,
+            ct =>
+            {
+                attempts++;
+                if (attempts < 3)
+                    throw new FakeRateLimitException(retryAfter: TimeSpan.FromMilliseconds(1));
+                return Task.FromResult(42);
+            },
+            CancellationToken.None);
+
+        Assert.AreEqual(42, result);
+        Assert.AreEqual(3, attempts, "Should have retried twice before success");
+    }
+
+    [TestMethod]
+    public async Task ExecuteAsync_RateLimit_ExhaustsRetriesAndThrows()
+    {
+        using var gateway = CreateGateway(
+            maxRetries: 2,
+            classifier: new FakeRateLimitClassifier());
+        var attempts = 0;
+
+        await Assert.ThrowsExactlyAsync<FakeRateLimitException>(async () =>
+            await gateway.ExecuteAsync<int>(
+                ModelTier.Balanced,
+                ct =>
+                {
+                    attempts++;
+                    throw new FakeRateLimitException(retryAfter: TimeSpan.FromMilliseconds(1));
+                },
+                CancellationToken.None));
+
+        // Initial attempt + 2 retries = 3 attempts total.
+        Assert.AreEqual(3, attempts);
+    }
+
+    [TestMethod]
+    public async Task ExecuteAsync_RateLimit_HonorsRetryAfter()
+    {
+        using var gateway = CreateGateway(
+            maxRetries: 1,
+            classifier: new FakeRateLimitClassifier());
+        var sw = System.Diagnostics.Stopwatch.StartNew();
+        var attempts = 0;
+
+        await gateway.ExecuteAsync(
+            ModelTier.Balanced,
+            ct =>
+            {
+                attempts++;
+                if (attempts == 1)
+                    throw new FakeRateLimitException(retryAfter: TimeSpan.FromMilliseconds(200));
+                return Task.FromResult(0);
+            },
+            CancellationToken.None);
+
+        sw.Stop();
+        Assert.AreEqual(2, attempts);
+        Assert.IsTrue(sw.ElapsedMilliseconds >= 180,
+            $"Expected at least ~200ms wait honoring Retry-After, but only {sw.ElapsedMilliseconds}ms elapsed");
+    }
+
+    [TestMethod]
+    public async Task ExecuteAsync_RateLimit_CancelDuringWait_Aborts()
+    {
+        using var gateway = CreateGateway(
+            maxRetries: 5,
+            classifier: new FakeRateLimitClassifier());
+        using var cts = new CancellationTokenSource();
+        var attempts = 0;
+
+        var task = gateway.ExecuteAsync<int>(
+            ModelTier.Balanced,
+            ct =>
+            {
+                attempts++;
+                throw new FakeRateLimitException(retryAfter: TimeSpan.FromSeconds(30));
+            },
+            cts.Token);
+
+        // Let the first attempt run and start the retry wait.
+        await WaitUntilAsync(() => attempts == 1, TimeSpan.FromSeconds(5));
+
+        cts.Cancel();
+
+        await Assert.ThrowsAsync<TaskCanceledException>(async () => await task);
+        Assert.AreEqual(1, attempts, "Cancellation should abort during the retry wait");
+    }
+
+    [TestMethod]
+    public async Task ExecuteAsync_RateLimit_NoRetryAfter_UsesExponentialBackoff()
+    {
+        var maxBackoffSeconds = 1; // Cap backoff at 1s so the test runs quickly.
+        using var gateway = CreateGateway(
+            maxRetries: 1,
+            maxBackoffSeconds: maxBackoffSeconds,
+            classifier: new FakeRateLimitClassifier());
+        var sw = System.Diagnostics.Stopwatch.StartNew();
+        var attempts = 0;
+
+        await gateway.ExecuteAsync(
+            ModelTier.Balanced,
+            ct =>
+            {
+                attempts++;
+                if (attempts == 1)
+                    throw new FakeRateLimitException(retryAfter: null);
+                return Task.FromResult(0);
+            },
+            CancellationToken.None);
+
+        sw.Stop();
+        Assert.AreEqual(2, attempts);
+        // Attempt 1 backoff is 2^0 = 1 second; capped at maxBackoff so still 1s.
+        Assert.IsTrue(sw.ElapsedMilliseconds >= 900,
+            $"Expected at least ~1s exponential backoff, but only {sw.ElapsedMilliseconds}ms elapsed");
     }
 
     private static async Task WaitUntilAsync(Func<bool> predicate, TimeSpan timeout)