From 76300ed4167d6c034079b574338f03a22e0e2765 Mon Sep 17 00:00:00 2001 From: Benedikt Date: Fri, 6 Feb 2026 17:11:22 +0100 Subject: [PATCH] fix: handshake worker starve when loading external certificate --- handshake.go | 55 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 38 insertions(+), 17 deletions(-) diff --git a/handshake.go b/handshake.go index d14c1382..a227015b 100644 --- a/handshake.go +++ b/handshake.go @@ -298,9 +298,9 @@ func (cfg *Config) getCertDuringHandshake(ctx context.Context, hello *tls.Client // domain, avoid pounding manager or storage thousands of times simultaneously. We use a similar sync // strategy for obtaining certificate during handshake. certLoadWaitChansMu.Lock() - wait, ok := certLoadWaitChans[name] + waiter, ok := certLoadWaitChans[name] if ok { - // another goroutine is already loading the cert; just wait and we'll get it from the in-memory cache + // another goroutine is already loading the cert; just wait certLoadWaitChansMu.Unlock() timeout := time.NewTimer(2 * time.Minute) @@ -310,33 +310,44 @@ func (cfg *Config) getCertDuringHandshake(ctx context.Context, hello *tls.Client case <-ctx.Done(): timeout.Stop() return Certificate{}, ctx.Err() - case <-wait: + case <-waiter.done: timeout.Stop() } - return cfg.getCertDuringHandshake(ctx, hello, false) - } else { - // no other goroutine is currently trying to load this cert - wait = make(chan struct{}) - certLoadWaitChans[name] = wait - certLoadWaitChansMu.Unlock() + // If the leader got a result from an external cert manager, use it + // directly — these certs are not added to the cache, so a recursive + // cache lookup would miss. For cached certs (on-demand, managed), + // the waiter result will be empty and we fall through to the + // original recursive lookup. + if !waiter.cert.Empty() || waiter.err != nil { + return waiter.cert, waiter.err + } - // unblock others and clean up when we're done - defer func() { - certLoadWaitChansMu.Lock() - close(wait) - delete(certLoadWaitChans, name) - certLoadWaitChansMu.Unlock() - }() + return cfg.getCertDuringHandshake(ctx, hello, false) } + // no other goroutine is currently trying to load this cert + waiter = &certLoadWaiter{done: make(chan struct{})} + certLoadWaitChans[name] = waiter + certLoadWaitChansMu.Unlock() + + // unblock others and clean up when we're done + defer func() { + certLoadWaitChansMu.Lock() + close(waiter.done) + delete(certLoadWaitChans, name) + certLoadWaitChansMu.Unlock() + }() + // If an external Manager is configured, try to get it from them. // Only continue to use our own logic if it returns empty+nil. externalCert, err := cfg.getCertFromAnyCertManager(ctx, hello, logger) if err != nil { + waiter.err = err return Certificate{}, err } if !externalCert.Empty() { + waiter.cert = externalCert return externalCert, nil } @@ -946,9 +957,19 @@ var ( obtainCertWaitChansMu sync.Mutex ) +// certLoadWaiter coordinates concurrent certificate loading for the same name. +// The leader populates the result and closes the channel; waiters read the result +// after the channel is closed. This allows externally-managed certificates (which +// are not cached) to be shared directly with waiting goroutines. +type certLoadWaiter struct { + done chan struct{} + cert Certificate + err error +} + // TODO: this lockset should probably be per-cache var ( - certLoadWaitChans = make(map[string]chan struct{}) + certLoadWaitChans = make(map[string]*certLoadWaiter) certLoadWaitChansMu sync.Mutex )