
Commit eff8531

craig[bot], arulajmani, and iskettaneh committed
158525: kv: introduce TestLeaseTransferAfterStoreLivenessSupportWithdrawn r=iskettaneh a=arulajmani

Test to demonstrate the hazard described in #158513.

Release note: None

158531: kvserver: deflake TestWriteLoadStatsAccounting r=iskettaneh a=iskettaneh

This commit increases the epsilonAllowed in the test TestWriteLoadStatsAccounting. Note that we recently increased it from 6 to 7, but this doesn't seem to have been enough.

Fixes: #158497

Release note: None

Co-authored-by: Arul Ajmani <arulajmani@gmail.com>
Co-authored-by: iskettaneh <173953022+iskettaneh@users.noreply.github.com>
3 parents 84f0bc2 + b8f02d9 + 6fce540 commit eff8531

File tree: 2 files changed (+184, −1 lines)


pkg/kv/kvserver/replica_rankings_test.go

Lines changed: 1 addition & 1 deletion
@@ -221,7 +221,7 @@ func TestWriteLoadStatsAccounting(t *testing.T) {
     tc := serverutils.StartCluster(t, 3, args)
     defer tc.Stopper().Stop(ctx)

-    const epsilonAllowed = 6
+    const epsilonAllowed = 10

     ts := tc.Server(0)
     db := ts.DB()
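
For context, the widened constant loosens a tolerance check on measured write statistics. The sketch below illustrates only the general shape of such a check; it is not the assertion used in TestWriteLoadStatsAccounting, and the function and variable names are hypothetical.

    package example

    import (
        "testing"

        "github.com/stretchr/testify/require"
    )

    // assertWithinEpsilon is an illustrative stand-in, not the real test's helper.
    func assertWithinEpsilon(t *testing.T, expectedWritesPerSecond, observedWritesPerSecond float64) {
        const epsilonAllowed = 10.0
        // require.InDelta fails when |observed - expected| > epsilonAllowed, so a
        // larger epsilon tolerates more noise in the measured write statistics and
        // makes this kind of assertion less prone to flaking.
        require.InDelta(t, expectedWritesPerSecond, observedWritesPerSecond, epsilonAllowed)
    }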

pkg/kv/kvserver/replica_store_liveness_test.go

Lines changed: 183 additions & 0 deletions
@@ -8,17 +8,25 @@ package kvserver_test
 import (
     "context"
     "fmt"
+    "sync/atomic"
     "testing"
+    "time"

     "github.com/cockroachdb/cockroach/pkg/base"
+    "github.com/cockroachdb/cockroach/pkg/config/zonepb"
     "github.com/cockroachdb/cockroach/pkg/keys"
     "github.com/cockroachdb/cockroach/pkg/kv/kvserver"
+    slpb "github.com/cockroachdb/cockroach/pkg/kv/kvserver/storeliveness/storelivenesspb"
+    "github.com/cockroachdb/cockroach/pkg/raft/raftpb"
     "github.com/cockroachdb/cockroach/pkg/roachpb"
+    "github.com/cockroachdb/cockroach/pkg/server"
     "github.com/cockroachdb/cockroach/pkg/settings/cluster"
+    "github.com/cockroachdb/cockroach/pkg/spanconfig"
     "github.com/cockroachdb/cockroach/pkg/testutils"
     "github.com/cockroachdb/cockroach/pkg/testutils/testcluster"
     "github.com/cockroachdb/cockroach/pkg/util/leaktest"
     "github.com/cockroachdb/cockroach/pkg/util/log"
+    "github.com/cockroachdb/errors"
     "github.com/stretchr/testify/require"
 )

@@ -174,3 +182,178 @@ func BenchmarkRaftFortificationEnabledForRangeID(b *testing.B) {
         _ = kvserver.RaftFortificationEnabledForRangeID(0.5, roachpb.RangeID(i))
     }
 }
+
+// TestLeaseTransferAfterStoreLivenessSupportWithdrawn is a regression test for
+// https://github.com/cockroachdb/cockroach/issues/158513. It demonstrates the
+// hazard where a lease can be transferred back to a store that has recently had
+// its support withdrawn.
+func TestLeaseTransferAfterStoreLivenessSupportWithdrawn(t *testing.T) {
+    defer leaktest.AfterTest(t)()
+    defer log.Scope(t).Close(t)
+
+    ctx := context.Background()
+    st := cluster.MakeTestingClusterSettings()
+    // Ensure we're using leader leases which rely on store liveness.
+    kvserver.OverrideDefaultLeaseType(ctx, &st.SV, roachpb.LeaseLeader)
+
+    const numNodes = 3
+    // Set up zone config with lease preference on n1 (rack=1).
+    zcfg := zonepb.DefaultZoneConfig()
+    zcfg.LeasePreferences = []zonepb.LeasePreference{
+        {
+            Constraints: []zonepb.Constraint{
+                {
+                    Type:  zonepb.Constraint_REQUIRED,
+                    Key:   "rack",
+                    Value: "1",
+                },
+            },
+        },
+    }
+
+    // Create server args with localities for each node.
+    serverArgs := make(map[int]base.TestServerArgs, numNodes)
+    for i := 0; i < numNodes; i++ {
+        serverArgs[i] = base.TestServerArgs{
+            Settings: st,
+            Locality: roachpb.Locality{
+                Tiers: []roachpb.Tier{{Key: "rack", Value: fmt.Sprintf("%d", i+1)}},
+            },
+            Knobs: base.TestingKnobs{
+                SpanConfig: &spanconfig.TestingKnobs{
+                    ConfigureScratchRange: true,
+                },
+                Server: &server.TestingKnobs{
+                    DefaultZoneConfigOverride: &zcfg,
+                },
+            },
+            RaftConfig: base.RaftConfig{
+                // Speed up the test by using faster raft ticks.
+                RaftTickInterval:           100 * time.Millisecond,
+                RaftElectionTimeoutTicks:   5,
+                RaftHeartbeatIntervalTicks: 1,
+            },
+        }
+    }
+
+    tc := testcluster.StartTestCluster(t, numNodes, base.TestClusterArgs{
+        ReplicationMode:   base.ReplicationManual,
+        ServerArgsPerNode: serverArgs,
+    })
+    defer tc.Stopper().Stop(ctx)
+
+    // Setup the test by creating a scratch range with voters on n1, n2, and n3.
+    // Ensure the lease is on n1 to begin with.
+    scratchKey := tc.ScratchRange(t)
+    desc := tc.AddVotersOrFatal(t, scratchKey, tc.Targets(1, 2)...)
+    tc.TransferRangeLeaseOrFatal(t, desc, tc.Target(0))
+    var repl1 *kvserver.Replica
+    testutils.SucceedsSoon(t, func() error {
+        repl1 = tc.GetFirstStoreFromServer(t, 0).LookupReplica(roachpb.RKey(scratchKey))
+        if repl1 == nil {
+            return errors.New("replica not found")
+        }
+        leaseStatus := repl1.CurrentLeaseStatus(ctx)
+        if !leaseStatus.IsValid() {
+            return errors.New("lease not valid")
+        }
+        if leaseStatus.Lease.Type() != roachpb.LeaseLeader {
+            return errors.Errorf("lease type is %s, expected leader lease", leaseStatus.Lease.Type())
+        }
+        if leaseStatus.Lease.Replica.StoreID != tc.Target(0).StoreID {
+            return errors.Errorf("lease holder is %d, expected %d",
+                leaseStatus.Lease.Replica.StoreID, tc.Target(0).StoreID)
+        }
+        return nil
+    })
+
+    // Set up store liveness heartbeat dropping from n1 to n2 and n3, which then
+    // causes n2 and n3 to withdraw support from n1.
+    var dropMessages atomic.Bool
+    dropMessages.Store(true)
+    dropStoreLivenessHeartbeatsFrom(t, tc.Server(1), desc, []roachpb.ReplicaID{1}, &dropMessages)
+    dropStoreLivenessHeartbeatsFrom(t, tc.Server(2), desc, []roachpb.ReplicaID{1}, &dropMessages)
+    t.Logf("blocking store liveness heartbeats from n1 to n2 and n3")
+
+    // Wait for n2 and n3 to withdraw support from n1.
+    store2 := tc.GetFirstStoreFromServer(t, 1)
+    store3 := tc.GetFirstStoreFromServer(t, 2)
+    n1StoreIdent := slpb.StoreIdent{NodeID: tc.Target(0).NodeID, StoreID: tc.Target(0).StoreID}
+
+    testutils.SucceedsSoon(t, func() error {
+        _, supported2 := store2.TestingStoreLivenessSupportManager().SupportFor(n1StoreIdent)
+        _, supported3 := store3.TestingStoreLivenessSupportManager().SupportFor(n1StoreIdent)
+        if supported2 || supported3 {
+            return errors.Errorf("support not yet withdrawn from n1: n2=%t, n3=%t", supported2, supported3)
+        }
+        return nil
+    })
+    t.Logf("n2 and n3 have withdrawn support from n1")
+
+    // We expect n1 to step down as the raft leader because of CheckQuorum and
+    // leadership to move to either n2 or n3.
+    var newLeaderIdx int
+    testutils.SucceedsSoon(t, func() error {
+        for i := 0; i < numNodes; i++ {
+            repl := tc.GetFirstStoreFromServer(t, i).LookupReplica(roachpb.RKey(scratchKey))
+            if repl == nil {
+                continue
+            }
+            rs := repl.RaftStatus()
+            if rs != nil && rs.RaftState == raftpb.StateLeader {
+                switch i {
+                case 0:
+                    return errors.New("n1 is still the raft leader")
+                default:
+                    newLeaderIdx = i
+                    return nil
+                }
+            }
+        }
+        return errors.New("no new leader elected yet")
+    })
+    t.Logf("n1 stepped down; new raft leader is n%d", newLeaderIdx+1)
+
+    // Send a Get request to the new leader's store to trigger lease acquisition.
+    newLeaderStore := tc.GetFirstStoreFromServer(t, newLeaderIdx)
+    _, err := newLeaderStore.DB().Get(ctx, scratchKey)
+    require.NoError(t, err)
+    t.Logf("sent Get request to n%d to trigger lease acquisition", newLeaderIdx+1)
+
+    // Re-enable store liveness heartbeats from n1 so it regains support.
+    dropMessages.Store(false)
+    t.Logf("re-enabled store liveness heartbeats from n1")
+
+    // Wait for n1 to regain support from n2 and n3.
+    testutils.SucceedsSoon(t, func() error {
+        _, supported2 := store2.TestingStoreLivenessSupportManager().SupportFor(n1StoreIdent)
+        _, supported3 := store3.TestingStoreLivenessSupportManager().SupportFor(n1StoreIdent)
+        if !supported2 || !supported3 {
+            return errors.Errorf("n1 has not yet regained support: n2=%t, n3=%t", supported2, supported3)
+        }
+        return nil
+    })
+    t.Logf("n1 has regained support from n2 and n3")
+
+    // Now enqueue the range in the lease queue on the current leaseholder.
+    // The lease queue should attempt to transfer the lease back to n1, since
+    // n1 is the preferred leaseholder, but it should only do so if it doesn't
+    // treat n1 as suspect. Currently, n1 is not treated as suspect if support
+    // has been recently withdrawn from it, which demonstrates the issue.
+    repl := newLeaderStore.LookupReplica(roachpb.RKey(scratchKey))
+    require.NotNil(t, repl)
+
+    _, err = newLeaderStore.Enqueue(ctx, "lease", repl, true /* skipShouldQueue */, false /* async */)
+    require.NoError(t, err)
+
+    // TODO(arul): This should be a require.Never followed by a require.Eventually
+    // once the suspect duration has passed.
+    require.Eventually(t, func() bool {
+        repl1 := tc.GetFirstStoreFromServer(t, 0).LookupReplica(roachpb.RKey(scratchKey))
+        leaseStatus := repl1.CurrentLeaseStatus(ctx)
+        if !leaseStatus.IsValid() {
+            return false
+        }
+        return leaseStatus.OwnedBy(tc.Target(0).StoreID)
+    }, 20*time.Second, 100*time.Millisecond)
+}
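
The TODO at the end of the test describes the assertion shape intended once suspect tracking exists: the lease should never return to n1 while the recent support withdrawal still marks it suspect, and should move back only afterwards. A rough sketch of that shape, reusing the test's tc, scratchKey, and ctx and assuming a hypothetical suspectDuration, could look like the following; it is not part of the commit.

    // Sketch of the require.Never / require.Eventually pairing the TODO describes.
    // suspectDuration is a hypothetical placeholder, not a value from the codebase.
    leaseOnN1 := func() bool {
        repl1 := tc.GetFirstStoreFromServer(t, 0).LookupReplica(roachpb.RKey(scratchKey))
        leaseStatus := repl1.CurrentLeaseStatus(ctx)
        return leaseStatus.IsValid() && leaseStatus.OwnedBy(tc.Target(0).StoreID)
    }
    const suspectDuration = 30 * time.Second // hypothetical value for illustration
    // While n1 is still treated as suspect, the lease should stay on the current holder.
    require.Never(t, leaseOnN1, suspectDuration, 100*time.Millisecond)
    // Once the suspect window has passed, the lease preference should pull the lease back to n1.
    require.Eventually(t, leaseOnN1, 20*time.Second, 100*time.Millisecond)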
