@@ -8,17 +8,25 @@ package kvserver_test
88import (
99 "context"
1010 "fmt"
11+ "sync/atomic"
1112 "testing"
13+ "time"
1214
1315 "github.com/cockroachdb/cockroach/pkg/base"
16+ "github.com/cockroachdb/cockroach/pkg/config/zonepb"
1417 "github.com/cockroachdb/cockroach/pkg/keys"
1518 "github.com/cockroachdb/cockroach/pkg/kv/kvserver"
19+ slpb "github.com/cockroachdb/cockroach/pkg/kv/kvserver/storeliveness/storelivenesspb"
20+ "github.com/cockroachdb/cockroach/pkg/raft/raftpb"
1621 "github.com/cockroachdb/cockroach/pkg/roachpb"
22+ "github.com/cockroachdb/cockroach/pkg/server"
1723 "github.com/cockroachdb/cockroach/pkg/settings/cluster"
24+ "github.com/cockroachdb/cockroach/pkg/spanconfig"
1825 "github.com/cockroachdb/cockroach/pkg/testutils"
1926 "github.com/cockroachdb/cockroach/pkg/testutils/testcluster"
2027 "github.com/cockroachdb/cockroach/pkg/util/leaktest"
2128 "github.com/cockroachdb/cockroach/pkg/util/log"
29+ "github.com/cockroachdb/errors"
2230 "github.com/stretchr/testify/require"
2331)
2432
@@ -174,3 +182,178 @@ func BenchmarkRaftFortificationEnabledForRangeID(b *testing.B) {
174182 _ = kvserver .RaftFortificationEnabledForRangeID (0.5 , roachpb .RangeID (i ))
175183 }
176184}
185+
186+ // TestLeaseTransferAfterStoreLivenessSupportWithdrawn is a regression test for
187+ // https://github.com/cockroachdb/cockroach/issues/158513. It demonstrates the
188+ // hazard where a lease can be transferred back to a store that has recently had
189+ // its support withdrawn.
190+ func TestLeaseTransferAfterStoreLivenessSupportWithdrawn (t * testing.T ) {
191+ defer leaktest .AfterTest (t )()
192+ defer log .Scope (t ).Close (t )
193+
194+ ctx := context .Background ()
195+ st := cluster .MakeTestingClusterSettings ()
196+ // Ensure we're using leader leases which rely on store liveness.
197+ kvserver .OverrideDefaultLeaseType (ctx , & st .SV , roachpb .LeaseLeader )
198+
199+ const numNodes = 3
200+ // Set up zone config with lease preference on n1 (rack=1).
201+ zcfg := zonepb .DefaultZoneConfig ()
202+ zcfg .LeasePreferences = []zonepb.LeasePreference {
203+ {
204+ Constraints : []zonepb.Constraint {
205+ {
206+ Type : zonepb .Constraint_REQUIRED ,
207+ Key : "rack" ,
208+ Value : "1" ,
209+ },
210+ },
211+ },
212+ }
213+
214+ // Create server args with localities for each node.
215+ serverArgs := make (map [int ]base.TestServerArgs , numNodes )
216+ for i := 0 ; i < numNodes ; i ++ {
217+ serverArgs [i ] = base.TestServerArgs {
218+ Settings : st ,
219+ Locality : roachpb.Locality {
220+ Tiers : []roachpb.Tier {{Key : "rack" , Value : fmt .Sprintf ("%d" , i + 1 )}},
221+ },
222+ Knobs : base.TestingKnobs {
223+ SpanConfig : & spanconfig.TestingKnobs {
224+ ConfigureScratchRange : true ,
225+ },
226+ Server : & server.TestingKnobs {
227+ DefaultZoneConfigOverride : & zcfg ,
228+ },
229+ },
230+ RaftConfig : base.RaftConfig {
231+ // Speed up the test by using faster raft ticks.
232+ RaftTickInterval : 100 * time .Millisecond ,
233+ RaftElectionTimeoutTicks : 5 ,
234+ RaftHeartbeatIntervalTicks : 1 ,
235+ },
236+ }
237+ }
238+
239+ tc := testcluster .StartTestCluster (t , numNodes , base.TestClusterArgs {
240+ ReplicationMode : base .ReplicationManual ,
241+ ServerArgsPerNode : serverArgs ,
242+ })
243+ defer tc .Stopper ().Stop (ctx )
244+
245+ // Setup the test by creating a scratch range with voters on n1, n2, and n3.
246+ // Ensure the lease is on n1 to begin with.
247+ scratchKey := tc .ScratchRange (t )
248+ desc := tc .AddVotersOrFatal (t , scratchKey , tc .Targets (1 , 2 )... )
249+ tc .TransferRangeLeaseOrFatal (t , desc , tc .Target (0 ))
250+ var repl1 * kvserver.Replica
251+ testutils .SucceedsSoon (t , func () error {
252+ repl1 = tc .GetFirstStoreFromServer (t , 0 ).LookupReplica (roachpb .RKey (scratchKey ))
253+ if repl1 == nil {
254+ return errors .New ("replica not found" )
255+ }
256+ leaseStatus := repl1 .CurrentLeaseStatus (ctx )
257+ if ! leaseStatus .IsValid () {
258+ return errors .New ("lease not valid" )
259+ }
260+ if leaseStatus .Lease .Type () != roachpb .LeaseLeader {
261+ return errors .Errorf ("lease type is %s, expected leader lease" , leaseStatus .Lease .Type ())
262+ }
263+ if leaseStatus .Lease .Replica .StoreID != tc .Target (0 ).StoreID {
264+ return errors .Errorf ("lease holder is %d, expected %d" ,
265+ leaseStatus .Lease .Replica .StoreID , tc .Target (0 ).StoreID )
266+ }
267+ return nil
268+ })
269+
270+ // Set up store liveness heartbeat dropping from n1 to n2 and n3, which then
271+ // causes n2 and n3 to withdraw support from n1.
272+ var dropMessages atomic.Bool
273+ dropMessages .Store (true )
274+ dropStoreLivenessHeartbeatsFrom (t , tc .Server (1 ), desc , []roachpb.ReplicaID {1 }, & dropMessages )
275+ dropStoreLivenessHeartbeatsFrom (t , tc .Server (2 ), desc , []roachpb.ReplicaID {1 }, & dropMessages )
276+ t .Logf ("blocking store liveness heartbeats from n1 to n2 and n3" )
277+
278+ // Wait for n2 and n3 to withdraw support from n1.
279+ store2 := tc .GetFirstStoreFromServer (t , 1 )
280+ store3 := tc .GetFirstStoreFromServer (t , 2 )
281+ n1StoreIdent := slpb.StoreIdent {NodeID : tc .Target (0 ).NodeID , StoreID : tc .Target (0 ).StoreID }
282+
283+ testutils .SucceedsSoon (t , func () error {
284+ _ , supported2 := store2 .TestingStoreLivenessSupportManager ().SupportFor (n1StoreIdent )
285+ _ , supported3 := store3 .TestingStoreLivenessSupportManager ().SupportFor (n1StoreIdent )
286+ if supported2 || supported3 {
287+ return errors .Errorf ("support not yet withdrawn from n1: n2=%t, n3=%t" , supported2 , supported3 )
288+ }
289+ return nil
290+ })
291+ t .Logf ("n2 and n3 have withdrawn support from n1" )
292+
293+ // We expect n1 to step down as the raft leader because of CheckQuorum and
294+ // leadership to move to either n2 or n3.
295+ var newLeaderIdx int
296+ testutils .SucceedsSoon (t , func () error {
297+ for i := 0 ; i < numNodes ; i ++ {
298+ repl := tc .GetFirstStoreFromServer (t , i ).LookupReplica (roachpb .RKey (scratchKey ))
299+ if repl == nil {
300+ continue
301+ }
302+ rs := repl .RaftStatus ()
303+ if rs != nil && rs .RaftState == raftpb .StateLeader {
304+ switch i {
305+ case 0 :
306+ return errors .New ("n1 is still the raft leader" )
307+ default :
308+ newLeaderIdx = i
309+ return nil
310+ }
311+ }
312+ }
313+ return errors .New ("no new leader elected yet" )
314+ })
315+ t .Logf ("n1 stepped down; new raft leader is n%d" , newLeaderIdx + 1 )
316+
317+ // Send a Get request to the new leader's store to trigger lease acquisition.
318+ newLeaderStore := tc .GetFirstStoreFromServer (t , newLeaderIdx )
319+ _ , err := newLeaderStore .DB ().Get (ctx , scratchKey )
320+ require .NoError (t , err )
321+ t .Logf ("sent Get request to n%d to trigger lease acquisition" , newLeaderIdx + 1 )
322+
323+ // Re-enable store liveness heartbeats from n1 so it regains support.
324+ dropMessages .Store (false )
325+ t .Logf ("re-enabled store liveness heartbeats from n1" )
326+
327+ // Wait for n1 to regain support from n2 and n3.
328+ testutils .SucceedsSoon (t , func () error {
329+ _ , supported2 := store2 .TestingStoreLivenessSupportManager ().SupportFor (n1StoreIdent )
330+ _ , supported3 := store3 .TestingStoreLivenessSupportManager ().SupportFor (n1StoreIdent )
331+ if ! supported2 || ! supported3 {
332+ return errors .Errorf ("n1 has not yet regained support: n2=%t, n3=%t" , supported2 , supported3 )
333+ }
334+ return nil
335+ })
336+ t .Logf ("n1 has regained support from n2 and n3" )
337+
338+ // Now enqueue the range in the lease queue on the current leaseholder.
339+ // The lease queue should attempt to transfer the lease back to n1, since
340+ // n1 is the preferred leaseholder, but it should only do so if it doesn't
341+ // treat n1 as suspect. Currently, n1 is not treated as suspect if support
342+ // has been recently withdrawn from it, which demonstrates the issue.
343+ repl := newLeaderStore .LookupReplica (roachpb .RKey (scratchKey ))
344+ require .NotNil (t , repl )
345+
346+ _ , err = newLeaderStore .Enqueue (ctx , "lease" , repl , true /* skipShouldQueue */ , false /* async */ )
347+ require .NoError (t , err )
348+
349+ // TODO(arul): This should be a require.Never followed by a require.Eventually
350+ // once the suspect duration has passed.
351+ require .Eventually (t , func () bool {
352+ repl1 := tc .GetFirstStoreFromServer (t , 0 ).LookupReplica (roachpb .RKey (scratchKey ))
353+ leaseStatus := repl1 .CurrentLeaseStatus (ctx )
354+ if ! leaseStatus .IsValid () {
355+ return false
356+ }
357+ return leaseStatus .OwnedBy (tc .Target (0 ).StoreID )
358+ }, 20 * time .Second , 100 * time .Millisecond )
359+ }
0 commit comments