Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion src/RockBot.Agent/Program.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using System.ClientModel;
using System.ClientModel.Primitives;
using Microsoft.Extensions.AI;
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.DependencyInjection;
Expand Down Expand Up @@ -127,7 +128,10 @@ IChatClient BuildOpenAIClient(LlmTierConfig config)
Endpoint = new Uri(config.Endpoint!),
// Extend from the 100s default — subagents with large tool sets generate
// longer responses that can exceed the default before the body is fully read.
NetworkTimeout = TimeSpan.FromMinutes(5)
NetworkTimeout = TimeSpan.FromMinutes(5),
// Disable SDK retry: the LlmGateway owns rate-limit retry policy. Without
// this, gateway and SDK both retry on 429 and silently double-retry.
RetryPolicy = new ClientRetryPolicy(maxRetries: 0)
})
.GetChatClient(config.ModelId!).AsIChatClient();
}
Expand Down
16 changes: 16 additions & 0 deletions src/RockBot.Host.Abstractions/LlmGatewayOptions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,20 @@ public sealed class LlmGatewayOptions
/// Expensive judgment calls; lower cap.
/// </summary>
public int HighMaxConcurrent { get; set; } = 2;

/// <summary>
/// Maximum number of retry attempts on rate-limit (HTTP 429) responses before
/// the call surfaces the failure to the caller. Each retry honors any
/// <c>Retry-After</c> response header supplied by the provider; in its absence,
/// exponential backoff (1s, 2s, 4s, 8s, ...) is used, capped by
/// <see cref="MaxBackoffSeconds"/>. Set to zero to disable retry on
/// rate-limit errors entirely.
/// </summary>
public int MaxRateLimitRetries { get; set; } = 5;

/// <summary>
/// Maximum backoff (in seconds) between retry attempts when no
/// <c>Retry-After</c> header is supplied by the provider. Caps the
/// exponential growth of the fallback backoff; has no effect when the
/// provider supplies an explicit <c>Retry-After</c> hint.
/// </summary>
public int MaxBackoffSeconds { get; set; } = 16;
}
67 changes: 67 additions & 0 deletions src/RockBot.Host/DefaultLlmRateLimitClassifier.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
using System.ClientModel;
using System.ClientModel.Primitives;
using System.Globalization;
using System.Net;

namespace RockBot.Host;

/// <summary>
/// Default <see cref="ILlmRateLimitClassifier"/> implementation. Recognizes
/// rate-limit errors surfaced by the OpenAI SDK (<see cref="ClientResultException"/>
/// with status 429) as well as generic <see cref="HttpRequestException"/>s that
/// carry <see cref="HttpStatusCode.TooManyRequests"/>. The full inner-exception
/// chain is examined so wrapped errors are still detected.
/// </summary>
internal sealed class DefaultLlmRateLimitClassifier : ILlmRateLimitClassifier
{
    public bool TryClassify(Exception exception, out TimeSpan? retryAfter)
    {
        retryAfter = null;

        // Walk the chain outermost-first so the first rate-limit shape wins.
        for (var ex = exception; ex is not null; ex = ex.InnerException)
        {
            switch (ex)
            {
                case ClientResultException { Status: 429 } sdkError:
                    retryAfter = ParseRetryAfter(sdkError.GetRawResponse());
                    return true;

                case HttpRequestException { StatusCode: HttpStatusCode.TooManyRequests }:
                    // HttpRequestException exposes no response headers, so a
                    // Retry-After hint cannot be read here; the gateway will
                    // use its exponential-backoff fallback instead.
                    return true;
            }
        }

        return false;
    }

    private static TimeSpan? ParseRetryAfter(PipelineResponse? response)
    {
        if (response is null
            || !response.Headers.TryGetValue("retry-after", out var headerValue)
            || string.IsNullOrEmpty(headerValue))
        {
            return null;
        }

        // Delta-seconds form: a non-negative integer number of seconds.
        if (int.TryParse(headerValue, NumberStyles.Integer, CultureInfo.InvariantCulture, out var deltaSeconds)
            && deltaSeconds >= 0)
        {
            return TimeSpan.FromSeconds(deltaSeconds);
        }

        // HTTP-date form (RFC 7231): convert to a wait relative to now,
        // clamping dates already in the past to zero.
        if (DateTimeOffset.TryParse(headerValue, CultureInfo.InvariantCulture,
                DateTimeStyles.AssumeUniversal | DateTimeStyles.AdjustToUniversal,
                out var retryAt))
        {
            var remaining = retryAt - DateTimeOffset.UtcNow;
            return remaining > TimeSpan.Zero ? remaining : TimeSpan.Zero;
        }

        return null;
    }
}
12 changes: 12 additions & 0 deletions src/RockBot.Host/HostDiagnostics.cs
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,18 @@ public static class HostDiagnostics
unit: "ms",
description: "Time spent waiting for a per-tier LLM gateway slot");

/// <summary>
/// Number of rate-limit retries performed by the LLM gateway. Tagged by tier
/// and by retry-after source: <c>"header"</c> when the provider supplied a
/// <c>Retry-After</c> hint that the gateway honored, <c>"backoff"</c> when the
/// gateway fell back to its exponential-backoff schedule. A sustained rise in
/// this counter indicates the provider is throttling the deployment.
/// </summary>
public static readonly Counter<long> LlmGatewayRateLimitRetries =
    Meter.CreateCounter<long>(
        "rockbot.llm.gateway.rate_limit_retries",
        unit: "{retry}",
        description: "Number of rate-limit (429) retries performed by the LLM gateway");

// ── Agent turn metrics — recorded at architectural boundaries ─────────────

/// <summary>Duration from user message receipt to final reply published.</summary>
Expand Down
22 changes: 22 additions & 0 deletions src/RockBot.Host/ILlmRateLimitClassifier.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
namespace RockBot.Host;

/// <summary>
/// Classifies exceptions thrown by underlying LLM SDK calls to determine whether
/// they represent a rate-limit (HTTP 429) condition that the gateway should retry.
/// </summary>
/// <remarks>
/// Implementations walk the exception chain to find a rate-limit indicator and,
/// where possible, extract the provider's <c>Retry-After</c> hint so the gateway
/// can honor it precisely instead of falling back to exponential backoff.
/// Pluggable so different providers (OpenAI, Anthropic-direct, Copilot, etc.)
/// can surface their own rate-limit shapes.
/// </remarks>
internal interface ILlmRateLimitClassifier
{
    /// <summary>
    /// Returns <c>true</c> if <paramref name="exception"/> indicates a rate-limit
    /// condition that should be retried. <paramref name="retryAfter"/> is set to
    /// the provider-supplied wait duration when available.
    /// </summary>
    /// <param name="exception">The exception thrown by the underlying SDK call; its inner-exception chain is also examined.</param>
    /// <param name="retryAfter">The provider-supplied <c>Retry-After</c> duration, or <c>null</c> when none was available (callers fall back to their own backoff).</param>
    bool TryClassify(Exception exception, out TimeSpan? retryAfter);
}
89 changes: 85 additions & 4 deletions src/RockBot.Host/LlmGateway.cs
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,15 @@ namespace RockBot.Host;
internal sealed class LlmGateway : ILlmGateway, IDisposable
{
private readonly TierSlot[] _slots;
private readonly ILlmRateLimitClassifier _classifier;
private readonly ILogger<LlmGateway> _logger;
private readonly int _maxRetries;
private readonly int _maxBackoffSeconds;

public LlmGateway(IOptions<LlmGatewayOptions> options, ILogger<LlmGateway> logger)
public LlmGateway(
IOptions<LlmGatewayOptions> options,
ILlmRateLimitClassifier classifier,
ILogger<LlmGateway> logger)
{
var opts = options.Value;

Expand All @@ -53,11 +59,26 @@ public LlmGateway(IOptions<LlmGatewayOptions> options, ILogger<LlmGateway> logge
_slots[(int)tier] = new TierSlot(cap);
}

if (opts.MaxRateLimitRetries < 0)
throw new ArgumentOutOfRangeException(
nameof(options),
$"LlmGatewayOptions.MaxRateLimitRetries must be >= 0 (was {opts.MaxRateLimitRetries}).");

if (opts.MaxBackoffSeconds < 1)
throw new ArgumentOutOfRangeException(
nameof(options),
$"LlmGatewayOptions.MaxBackoffSeconds must be >= 1 (was {opts.MaxBackoffSeconds}).");

_classifier = classifier;
_logger = logger;
_maxRetries = opts.MaxRateLimitRetries;
_maxBackoffSeconds = opts.MaxBackoffSeconds;

_logger.LogInformation(
"LlmGateway: per-tier concurrency caps Low={Low} Balanced={Balanced} High={High}",
opts.LowMaxConcurrent, opts.BalancedMaxConcurrent, opts.HighMaxConcurrent);
"LlmGateway: per-tier concurrency caps Low={Low} Balanced={Balanced} High={High}, " +
"rate-limit retries Max={MaxRetries} backoff cap={MaxBackoff}s",
opts.LowMaxConcurrent, opts.BalancedMaxConcurrent, opts.HighMaxConcurrent,
_maxRetries, _maxBackoffSeconds);
}

/// <summary>
Expand Down Expand Up @@ -99,7 +120,8 @@ public async Task<T> ExecuteAsync<T>(
Interlocked.Increment(ref slot.InFlight);
try
{
return await operation(cancellationToken).ConfigureAwait(false);
return await ExecuteWithRetryAsync(tier, tierTag, operation, cancellationToken)
.ConfigureAwait(false);
}
finally
{
Expand All @@ -108,6 +130,65 @@ public async Task<T> ExecuteAsync<T>(
}
}

/// <summary>
/// Invokes <paramref name="operation"/> and, on rate-limit (HTTP 429) failures
/// identified by the injected classifier, retries up to <c>MaxRateLimitRetries</c>
/// times. The tier slot is held throughout — releasing it during retry waits
/// does not help, since rate limits are per-tier, so any other call in the same
/// tier would hit the same limit.
/// </summary>
/// <param name="tier">Model tier whose slot is held; used for logging and metric tags.</param>
/// <param name="tierTag">Pre-built metric tag identifying the tier.</param>
/// <param name="operation">The LLM call to execute; receives the caller's cancellation token.</param>
/// <param name="cancellationToken">Aborts both the operation and any retry wait.</param>
private async Task<T> ExecuteWithRetryAsync<T>(
    ModelTier tier,
    KeyValuePair<string, object?> tierTag,
    Func<CancellationToken, Task<T>> operation,
    CancellationToken cancellationToken)
{
    var attempt = 0;
    while (true)
    {
        try
        {
            return await operation(cancellationToken).ConfigureAwait(false);
        }
        catch (Exception ex) when (
            attempt < _maxRetries
            && !cancellationToken.IsCancellationRequested
            && _classifier.TryClassify(ex, out var classifierRetryAfter))
        {
            attempt++;

            // Prefer the provider's explicit Retry-After hint; otherwise fall
            // back to capped exponential backoff.
            var source = classifierRetryAfter.HasValue ? "header" : "backoff";
            var wait = classifierRetryAfter ?? ComputeBackoff(attempt);

            _logger.LogWarning(
                "LlmGateway: rate-limit on tier {Tier} (attempt {Attempt}/{Max}); " +
                "waiting {WaitSeconds}s ({Source}) before retry",
                tier, attempt, _maxRetries, wait.TotalSeconds, source);

            HostDiagnostics.LlmGatewayRateLimitRetries.Add(
                1,
                tierTag,
                new KeyValuePair<string, object?>("rockbot.llm.gateway.retry_after_source", source));

            // If cancellation fires during the wait, Task.Delay throws
            // OperationCanceledException, which propagates to the caller.
            // (A catch-and-rethrow here would be a no-op.)
            await Task.Delay(wait, cancellationToken).ConfigureAwait(false);
        }
    }
}

/// <summary>
/// Fallback wait used when the provider supplies no <c>Retry-After</c> hint:
/// 2^(attempt−1) seconds — 1s, 2s, 4s, 8s, ... — never exceeding the
/// configured <c>MaxBackoffSeconds</c> cap. <paramref name="attempt"/> is 1-based.
/// </summary>
private TimeSpan ComputeBackoff(int attempt)
{
    // Math.Pow avoids integer-shift overflow for large attempt counts.
    var uncappedSeconds = Math.Pow(2, attempt - 1);
    var cappedSeconds = Math.Min(uncappedSeconds, _maxBackoffSeconds);
    return TimeSpan.FromSeconds(cappedSeconds);
}

public void Dispose()
{
foreach (var slot in _slots)
Expand Down
1 change: 1 addition & 0 deletions src/RockBot.Host/ServiceCollectionExtensions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ public static IServiceCollection AddRockBotHost(
services.AddSingleton<LlmCostEstimator>();
services.Configure<LlmPricingOptions>(_ => { });
services.Configure<LlmGatewayOptions>(_ => { });
services.AddSingleton<ILlmRateLimitClassifier, DefaultLlmRateLimitClassifier>();
services.AddSingleton<ILlmGateway, LlmGateway>();
services.AddTransient<ILlmClient, LlmClient>();
services.AddSingleton<IToolProgressNotifier, ToolProgressNotifier>();
Expand Down
9 changes: 8 additions & 1 deletion src/RockBot.ResearchAgent/Program.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using System.ClientModel;
using System.ClientModel.Primitives;
using Microsoft.Extensions.AI;
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.DependencyInjection;
Expand Down Expand Up @@ -49,7 +50,13 @@ IChatClient BuildClient(LlmTierConfig config)
{
return new OpenAIClient(
new ApiKeyCredential(config.ApiKey!),
new OpenAIClientOptions { Endpoint = new Uri(config.Endpoint!) })
new OpenAIClientOptions
{
Endpoint = new Uri(config.Endpoint!),
// Disable SDK retry: the LlmGateway owns rate-limit retry policy.
// Without this, gateway and SDK both retry on 429 and silently double-retry.
RetryPolicy = new ClientRetryPolicy(maxRetries: 0)
})
.GetChatClient(config.ModelId!).AsIChatClient();
}

Expand Down
8 changes: 7 additions & 1 deletion src/RockBot.SampleAgent/Program.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using System.ClientModel;
using System.ClientModel.Primitives;
using Microsoft.Extensions.AI;
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.DependencyInjection;
Expand Down Expand Up @@ -26,7 +27,12 @@
{
var openAiClient = new OpenAIClient(
new ApiKeyCredential(apiKey),
new OpenAIClientOptions { Endpoint = new Uri(endpoint) });
new OpenAIClientOptions
{
Endpoint = new Uri(endpoint),
// Disable SDK retry: the LlmGateway owns rate-limit retry policy.
RetryPolicy = new ClientRetryPolicy(maxRetries: 0)
});

builder.Services.AddRockBotChatClient(
openAiClient.GetChatClient(modelId).AsIChatClient());
Expand Down
Loading
Loading