Skip to content

Commit ec74bc6

Browse files
committed
fix(rds-refresh): prevent race condition in WaitForRefreshComplete
The function could return success prematurely if DBLab's status still showed StatusFinished from a previous refresh. Since /full-refresh triggers the operation in a goroutine, polling immediately after could see stale status. Now tracks whether refresh has actually started before accepting StatusFinished as success. This prevents premature clone deletion.
1 parent cb8223a commit ec74bc6

File tree

1 file changed

+27
-6
lines changed

1 file changed

+27
-6
lines changed

rds-refresh/dblab.go

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -146,18 +146,26 @@ func (c *DBLabClient) TriggerFullRefresh(ctx context.Context) error {
146146
}
147147

148148
// WaitForRefreshComplete polls the DBLab status until refresh is complete or timeout.
149+
// It first waits for the refresh to start (status changes from finished/inactive),
150+
// then waits for it to complete. This prevents race conditions where stale status
151+
// from a previous refresh could cause premature return.
149152
func (c *DBLabClient) WaitForRefreshComplete(ctx context.Context, pollInterval, timeout time.Duration) error {
150153
ticker := time.NewTicker(pollInterval)
151154
defer ticker.Stop()
152155

153156
timeoutTimer := time.NewTimer(timeout)
154157
defer timeoutTimer.Stop()
155158

159+
refreshStarted := false
160+
156161
for {
157162
select {
158163
case <-ctx.Done():
159164
return ctx.Err()
160165
case <-timeoutTimer.C:
166+
if !refreshStarted {
167+
return fmt.Errorf("timeout waiting for refresh to start after %v", timeout)
168+
}
161169
return fmt.Errorf("timeout waiting for refresh to complete after %v", timeout)
162170
case <-ticker.C:
163171
status, err := c.GetStatus(ctx)
@@ -168,21 +176,34 @@ func (c *DBLabClient) WaitForRefreshComplete(ctx context.Context, pollInterval,
168176
retrievalStatus := status.Retrieving.Status
169177

170178
switch retrievalStatus {
179+
case StatusRefreshing, StatusSnapshotting, StatusRenewed, StatusPending:
180+
// refresh is in progress - mark as started
181+
refreshStarted = true
182+
continue
171183
case StatusFinished:
184+
if !refreshStarted {
185+
// still showing old status, refresh hasn't started yet
186+
continue
187+
}
188+
// refresh started and now finished
172189
return nil
173190
case StatusFailed:
191+
if !refreshStarted {
192+
// old failure status, refresh hasn't started yet
193+
continue
194+
}
174195
if len(status.Retrieving.Alerts) > 0 {
175196
for _, alert := range status.Retrieving.Alerts {
176197
return fmt.Errorf("refresh failed: %s", alert.Message)
177198
}
178199
}
179-
180200
return fmt.Errorf("refresh failed (no details available)")
181-
case StatusRefreshing, StatusSnapshotting, StatusRenewed:
182-
// still in progress
183-
continue
184-
case StatusInactive, StatusPending:
185-
// not started yet or pending
201+
case StatusInactive:
202+
if refreshStarted {
203+
// was running but now inactive - unusual, treat as failure
204+
return fmt.Errorf("refresh stopped unexpectedly (status: inactive)")
205+
}
206+
// not started yet
186207
continue
187208
default:
188209
continue

0 commit comments

Comments
 (0)