Skip to content

Commit f89daab

Browse files
committed
mma: eliminate pendingChangeNoRollback
Instead, the ReplicaChange.prev field is updated to reflect the latest state reported by the leaseholder. In addition to simplifyin the code, it fixes an existing issue where an undo could rollback to a state preceding the latest leaseholder state. Informs #157049 Epic: CRDB-55052 Release note: None
1 parent 076a595 commit f89daab

File tree

7 files changed

+325
-191
lines changed

7 files changed

+325
-191
lines changed

pkg/kv/kvserver/allocator/mmaprototype/allocator_state.go

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -257,18 +257,14 @@ func (a *allocatorState) ProcessStoreLoadMsg(ctx context.Context, msg *StoreLoad
257257
func (a *allocatorState) AdjustPendingChangeDisposition(change ExternalRangeChange, success bool) {
258258
a.mu.Lock()
259259
defer a.mu.Unlock()
260-
rs, ok := a.cs.ranges[change.RangeID]
260+
_, ok := a.cs.ranges[change.RangeID]
261261
if !ok {
262262
// Range no longer exists. This can happen if the StoreLeaseholderMsg
263263
// which included the effect of the change that transferred the lease away
264264
// was already processed, causing the range to no longer be tracked by the
265265
// allocator.
266266
return
267267
}
268-
if !success && rs.pendingChangeNoRollback {
269-
// Not allowed to undo.
270-
return
271-
}
272268
// NB: It is possible that some of the changes have already been enacted via
273269
// StoreLeaseholderMsg, and even been garbage collected. So no assumption
274270
// can be made about whether these changes will be found in the allocator's

pkg/kv/kvserver/allocator/mmaprototype/cluster_state.go

Lines changed: 55 additions & 111 deletions
Original file line numberDiff line numberDiff line change
@@ -207,12 +207,12 @@ type ReplicaChange struct {
207207
// replica being demoted cannot retain the lease).
208208
//
209209
// NB: The prev value is always the state before the change. This is the
210-
// source of truth provided by the leaseholder in the RangeMsg, so will
211-
// have real ReplicaIDs (if already a replica) and real ReplicaTypes
210+
// latest source of truth provided by the leaseholder in the RangeMsg, so
211+
// will have real ReplicaIDs (if already a replica) and real ReplicaTypes
212212
// (including types beyond VOTER_FULL and NON_VOTER). This source-of-truth
213213
// claim is guaranteed by REQUIREMENT(change-computation) documented
214-
// elsewhere, and the fact that new changes are computed only when there
215-
// are no pending changes for a range.
214+
// elsewhere, and the fact that new changes are computed only when there are
215+
// no pending changes for a range.
216216
//
217217
// The ReplicaType in next is either the zero value (for removals), or
218218
// {VOTER_FULL, NON_VOTER} for additions/change, i.e., it represents the
@@ -221,6 +221,9 @@ type ReplicaChange struct {
221221
// TODO(tbg): in MakeLeaseTransferChanges, next.ReplicaType.ReplicaType is
222222
// simply the current value, and not necessarily {VOTER_FULL, NON_VOTER}.
223223
// So the above comment is incorrect. We should clean this up.
224+
//
225+
// The prev field is mutable after creation, to ensure that an undo restores
226+
// the state to the latest source of truth from the leaseholder.
224227
prev ReplicaState
225228
next ReplicaIDAndType
226229
}
@@ -251,6 +254,9 @@ func (rc ReplicaChange) isUpdate() bool {
251254
return changeType == AddLease || changeType == RemoveLease || changeType == ChangeReplica
252255
}
253256

257+
// replicaChangeType returns the type of change currently represented by prev
258+
// and next. Since prev is mutable, the result of this method can change over
259+
// time.
254260
func (rc ReplicaChange) replicaChangeType() ReplicaChangeType {
255261
prevExists := replicaExists(rc.prev.ReplicaID)
256262
nextExists := replicaExists(rc.next.ReplicaID)
@@ -630,21 +636,17 @@ type pendingReplicaChange struct {
630636
// expiry. All replica changes in a PendingRangeChange have the same
631637
// startTime.
632638
startTime time.Time
633-
// gcTime represents a time when the unenacted change should be GC'd, either
634-
// using the normal GC undo path, or if rangeState.pendingChangeNoRollback
635-
// is true, when processing a RangeMsg from the leaseholder.
639+
// gcTime represents a time when the unenacted change should be GC'd.
640+
//
641+
// Mutable after creation.
636642
gcTime time.Time
637643

638-
// TODO(kvoli,sumeerbhola): Consider adopting an explicit expiration time,
639-
// after which the change is considered to have been rejected. This would
640-
// allow a different expiration time for different types of changes, e.g.,
641-
// lease transfers would have a smaller expiration time than replica
642-
// additions.
643-
644644
// When the change is known to be enacted based on the authoritative
645645
// information received from the leaseholder, this value is set, so that even
646646
// if the store with a replica affected by this pending change does not tell
647647
// us about the enactment, we can garbage collect this change.
648+
//
649+
// Mutable after creation.
648650
enactedAtTime time.Time
649651
}
650652

@@ -955,7 +957,7 @@ type rangeState struct {
955957
// that are still at the initial state, or an intermediate state, it can
956958
// continue anticipating that these pending changes will happen. Tracking
957959
// what is pending also allows for undo in the case of explicit failure,
958-
// notified by AdjustPendingChangesDisposition.
960+
// notified by AdjustPendingChangesDisposition, or GC.
959961
//
960962
// 2. Lifecycle
961963
// pendingChanges track proposed modifications to a range's replicas or
@@ -981,17 +983,9 @@ type rangeState struct {
981983
// has been enacted in this case.
982984
//
983985
// 2. Undone as failed: corresponding replica and load change is rolled back.
984-
// Note that for replica changes that originate from one action, all changes
985-
// would be undone together.
986-
// NB: pending changes of a range state originate from one decision.
987-
// Therefore, when one pending change is enacted successfully, we mark this
988-
// range state's pending changes as no rollback (read more about this in 3).
989-
// If we are here trying to undo a pending change but the range state has
990-
// already been marked as no rollback, we do not undo the remaining pending
991-
// changes. Instead, we wait for a StoreLeaseholderMsg to discard the pending
992-
// changes and revert the load adjustments after the
993-
// partiallyEnactedGCDuration has elapsed since the first enacted change. The
994-
// modeling here is imperfect (read more about this in 3).
986+
// Note that for replica changes that originate from one action, some changes
987+
// can be considered done because of the leaseholder msg, and others can be
988+
// rolled back (say due to GC).
995989
//
996990
// This happens when:
997991
// - The pending change failed to apply via
@@ -1052,14 +1046,10 @@ type rangeState struct {
10521046
// the replica and leaseholder to s4. An intermediate state that can be
10531047
// observed is {s1, s2, s3, s4} with the lease still at s3. But the pending
10541048
// change for adding s4 includes both that it has a replica, and it has the
1055-
// lease, so we will not mark it done, and keep pretending that the whole
1056-
// change is pending. Since lease transfers are fast, we accept this
1057-
// imperfect modeling fidelity. One consequence of this imperfect modeling
1058-
// is that if in this example there are no further changes observed until
1059-
// GC, the allocator will undo both changes and go back to the state {s1,
1060-
// s2, s3} with s3 as the leaseholder. That is, it has forgotten that s4 was
1061-
// added. This is unavoidable and will be fixed by the first
1062-
// StoreLeaseholderMsg post-GC.
1049+
// lease, so we will not mark it done, and keep pretending that the change
1050+
// is pending. However, we will change the prev state to indicate that s4
1051+
// has a replica, so that undo (say due to GC) rolls back to the latest
1052+
// source-of-truth from the leaseholder.
10631053
//
10641054
// 4. Non Atomicity Hazard
10651055
//
@@ -1068,20 +1058,19 @@ type rangeState struct {
10681058
// to contend with the hazard of having two leaseholders or no leaseholders.
10691059
// In the earlier example, say s3 and s4 were both local stores (a
10701060
// multi-store node), it may be possible to observe an intermediate state
1071-
// {s1, s2, s3, s4} where s4 is the leaseholder. If we subsequently get a
1072-
// spurious AdjustPendingChangesDisposition(success=false) call, or
1073-
// time-based GC causes the s3 removal to be undone, there will be two
1074-
// replicas marked as the leaseholder. The other extreme is believing that
1075-
// the s3 transfer is done and the s4 incoming replica (and lease) failed
1076-
// (this may not actually be possible because of the surrounding code).
1061+
// {s1, s2, s3, s4} where s4 is the leaseholder. We need to ensure that if
1062+
// we subsequently get a spurious
1063+
// AdjustPendingChangesDisposition(success=false) call, or time-based GC
1064+
// causes the s3 removal to be undone, there will not be two replicas marked
1065+
// as the leaseholder. The other extreme is believing that the s3 transfer
1066+
// is done and the s4 incoming replica (and lease) failed (this may not
1067+
// actually be possible because of the surrounding code).
10771068
//
1078-
// We deal with this hazard by observing that we've constructed multiple
1079-
// pending changes in order to observe intermediate changes in the common
1080-
// case of success. Once one change in the set of changes is considered
1081-
// enacted, we mark the whole remaining group as no-rollback. In the above
1082-
// case, if we see s4 has become the leaseholder, the s1 removal can't undo
1083-
// itself -- it can be dropped if it is considered subsumed when processing
1084-
// a RangeMsg, or it can be GC'd.
1069+
// This hazard is dealt with in the same way outlined in the earlier
1070+
// example: when the leaseholder msg from s4 arrives that lists {s1, s2, s3,
1071+
// s4} as replicas, the prev state for the s3 change is updated to indicate
1072+
// that it is not the leaseholder. This means that if the change is undone,
1073+
// it will return to a prev state where it has a replica but not the lease.
10851074
//
10861075
// Additionally, when processing a RangeMsg, if any of the pending changes
10871076
// is considered inconsistent, all the pending changes are discarded. This
@@ -1101,11 +1090,6 @@ type rangeState struct {
11011090
// rangeState.pendingChanges across all ranges in clusterState.ranges will
11021091
// be identical to clusterState.pendingChanges.
11031092
pendingChanges []*pendingReplicaChange
1104-
// When set, the pendingChanges can not be rolled back anymore. They have
1105-
// to be enacted, or discarded wholesale in favor of the latest RangeMsg
1106-
// from the leaseholder. It is reset to false when pendingChanges
1107-
// transitions from empty to non-empty.
1108-
pendingChangeNoRollback bool
11091093

11101094
// If non-nil, it is up-to-date. Typically, non-nil for a range that has no
11111095
// pendingChanges and is not satisfying some constraint, since we don't want
@@ -1449,27 +1433,15 @@ func (cs *clusterState) processStoreLeaseholderMsgInternal(
14491433
// The change has been enacted according to the leaseholder.
14501434
enactedChanges = append(enactedChanges, change)
14511435
} else {
1436+
// Not subsumed. Replace the prev with the latest source of truth from
1437+
// the leaseholder. Note, this can be the noReplicaID case from above.
1438+
change.prev = adjustedReplica
14521439
remainingChanges = append(remainingChanges, change)
14531440
}
14541441
}
1455-
gcRemainingChanges := false
1456-
if rs.pendingChangeNoRollback {
1457-
// A previous StoreLeaseholderMsg has enacted some changes, so the
1458-
// remainingChanges may be GC'able. All of them share the same GC time.
1459-
// Note that normal GC will not GC these, since normal GC needs to undo,
1460-
// and we are not allowed to undo these.
1461-
if len(remainingChanges) > 0 {
1462-
gcTime := remainingChanges[0].gcTime
1463-
if gcTime.Before(now) {
1464-
gcRemainingChanges = true
1465-
}
1466-
}
1467-
} else if len(enactedChanges) > 0 && len(remainingChanges) > 0 {
1468-
// First time this set of changes is seeing something enacted, and there
1469-
// are remaining changes.
1442+
if len(enactedChanges) > 0 && len(remainingChanges) > 0 {
1443+
// There are remaining changes, so potentially update their gcTime.
14701444
//
1471-
// No longer permitted to rollback.
1472-
rs.pendingChangeNoRollback = true
14731445
// All remaining changes have the same gcTime.
14741446
curGCTime := remainingChanges[0].gcTime
14751447
revisedGCTime := now.Add(partiallyEnactedGCDuration)
@@ -1513,27 +1485,19 @@ func (cs *clusterState) processStoreLeaseholderMsgInternal(
15131485
// preCheckOnApplyReplicaChanges returns false if there are any pending
15141486
// changes, and these are the changes that are pending. This is hacky
15151487
// and should be cleaned up.
1516-
var valid bool
1517-
var reason redact.RedactableString
1518-
if gcRemainingChanges {
1519-
reason = "GCing remaining changes after partial enactment"
1520-
} else {
1521-
// NB: rs.pendingChanges contains the same changes as
1522-
// remainingChanges, but they are not the same slice.
1523-
rc := rs.pendingChanges
1524-
rs.pendingChanges = nil
1525-
err := cs.preCheckOnApplyReplicaChanges(PendingRangeChange{
1526-
RangeID: rangeMsg.RangeID,
1527-
pendingReplicaChanges: remainingChanges,
1528-
})
1529-
valid = err == nil
1530-
if err != nil {
1531-
reason = redact.Sprint(err)
1532-
}
1533-
// Restore it.
1534-
rs.pendingChanges = rc
1535-
}
1536-
if valid {
1488+
//
1489+
// NB: rs.pendingChanges contains the same changes as
1490+
// remainingChanges, but they are not the same slice.
1491+
rc := rs.pendingChanges
1492+
rs.pendingChanges = nil
1493+
err := cs.preCheckOnApplyReplicaChanges(PendingRangeChange{
1494+
RangeID: rangeMsg.RangeID,
1495+
pendingReplicaChanges: remainingChanges,
1496+
})
1497+
// Restore it.
1498+
rs.pendingChanges = rc
1499+
1500+
if err == nil {
15371501
// Re-apply the remaining changes. Note that the load change was not
15381502
// undone above, so we pass !applyLoadChange, to avoid applying it
15391503
// again. Also note that applyReplicaChange does not add to the various
@@ -1543,6 +1507,7 @@ func (cs *clusterState) processStoreLeaseholderMsgInternal(
15431507
cs.applyReplicaChange(change.ReplicaChange, false)
15441508
}
15451509
} else {
1510+
reason := redact.Sprint(err)
15461511
// The current state provided by the leaseholder does not permit these
15471512
// changes, so we need to drop them. This should be rare, but can happen
15481513
// if the leaseholder executed a change that MMA was completely unaware
@@ -1787,7 +1752,6 @@ func (cs *clusterState) processStoreLeaseholderMsgInternal(
17871752
topk := ss.adjusted.topKRanges[msg.StoreID]
17881753
topk.doneInit()
17891754
}
1790-
17911755
}
17921756

17931757
// If the pending replica change does not happen within this GC duration, we
@@ -1821,15 +1785,6 @@ func (cs *clusterState) gcPendingChanges(now time.Time) {
18211785
if !ok {
18221786
panic(errors.AssertionFailedf("range %v not found in cluster state", rangeID))
18231787
}
1824-
1825-
// Unlike normal GC that reverts changes, we want to discard these pending
1826-
// changes. Do nothing here; processStoreLeaseholderMsgInternal will later
1827-
// detect and discard these pending changes. Note that
1828-
// processStoreLeaseholderMsgInternal will not revert the pending load
1829-
// change.
1830-
if rs.pendingChangeNoRollback {
1831-
continue
1832-
}
18331788
if len(rs.pendingChanges) == 0 {
18341789
panic(errors.AssertionFailedf("no pending changes in range %v", rangeID))
18351790
}
@@ -1871,8 +1826,6 @@ func (cs *clusterState) pendingChangeEnacted(cid changeID, enactedAt time.Time)
18711826
}
18721827

18731828
// undoPendingChange reverses the change with ID cid.
1874-
//
1875-
// REQUIRES: the change is not marked as no-rollback.
18761829
func (cs *clusterState) undoPendingChange(cid changeID) {
18771830
change, ok := cs.pendingChanges[cid]
18781831
if !ok {
@@ -1882,10 +1835,6 @@ func (cs *clusterState) undoPendingChange(cid changeID) {
18821835
if !ok {
18831836
panic(errors.AssertionFailedf("range %v not found in cluster state", change.rangeID))
18841837
}
1885-
if rs.pendingChangeNoRollback {
1886-
// One cannot undo changes once no-rollback is true.
1887-
panic(errors.AssertionFailedf("pending change is marked as no-rollback"))
1888-
}
18891838
// Wipe the analyzed constraints, as the range has changed.
18901839
rs.clearAnalyzedConstraints()
18911840
rs.lastFailedChange = cs.ts.Now()
@@ -1944,9 +1893,8 @@ func (cs *clusterState) addPendingRangeChange(change PendingRangeChange) {
19441893
// Only the lease is being transferred.
19451894
gcDuration = pendingLeaseTransferGCDuration
19461895
}
1947-
pendingChanges := change.pendingReplicaChanges
19481896
now := cs.ts.Now()
1949-
for _, pendingChange := range pendingChanges {
1897+
for _, pendingChange := range change.pendingReplicaChanges {
19501898
cs.applyReplicaChange(pendingChange.ReplicaChange, true)
19511899
cs.changeSeqGen++
19521900
cid := cs.changeSeqGen
@@ -1959,11 +1907,9 @@ func (cs *clusterState) addPendingRangeChange(change PendingRangeChange) {
19591907
cs.pendingChanges[cid] = pendingChange
19601908
storeState.adjusted.loadPendingChanges[cid] = pendingChange
19611909
rangeState.pendingChanges = append(rangeState.pendingChanges, pendingChange)
1962-
rangeState.pendingChangeNoRollback = false
19631910
log.KvDistribution.VEventf(context.Background(), 3,
19641911
"addPendingRangeChange: change_id=%v, range_id=%v, change=%v",
19651912
cid, rangeID, pendingChange.ReplicaChange)
1966-
pendingChanges = append(pendingChanges, pendingChange)
19671913
}
19681914
}
19691915

@@ -2031,8 +1977,6 @@ func (cs *clusterState) preCheckOnApplyReplicaChanges(rangeChange PendingRangeCh
20311977
// preCheckOnUndoReplicaChanges does some validation of the changes being
20321978
// proposed for undo.
20331979
//
2034-
// REQUIRES: the rangeState.pendingChangeNoRollback is false.
2035-
//
20361980
// This method is defensive since if we always check against the current state
20371981
// before allowing a change to be added (including re-addition after a
20381982
// StoreLeaseholderMsg), we should never have invalidity during an undo, if

pkg/kv/kvserver/allocator/mmaprototype/range_change.go

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,10 @@ import (
1515
// ExternalRangeChange is a proposed set of change(s) to a range. It can
1616
// consist of multiple replica changes, such as adding or removing replicas,
1717
// or transferring the lease. There is at most one change per store in the
18-
// set.
18+
// set. It is immutable after creation.
19+
//
20+
// It is a partial external representation of a set of changes that are
21+
// internally modeled using a slice of *pendingReplicaChanges.
1922
type ExternalRangeChange struct {
2023
roachpb.RangeID
2124
Changes []ExternalReplicaChange
@@ -24,6 +27,12 @@ type ExternalRangeChange struct {
2427
// ExternalReplicaChange is a proposed change to a single replica. Some
2528
// external entity (the leaseholder of the range) may choose to enact this
2629
// change.
30+
//
31+
// It is a partial external representation of a pendingReplicaChange that is
32+
// internal to MMA. While pendingReplicaChange has fields that are mutable,
33+
// since partial changes can be observed to a store, the ExternalReplicaChange
34+
// is immutable and represents the original before and after state of the
35+
// change.
2736
type ExternalReplicaChange struct {
2837
changeID
2938
// Target is the target {store, node} for the change.

pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_replica.txt

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ store-leaseholder-msg
9292
store-id=1
9393
range-id=1 load=[80,80,80] raft-cpu=20
9494
config=(num_replicas=3 constraints={'+region=us-west-1:1'} voter_constraints={'+region=us-west-1:1'})
95-
store-id=1 replica-id=2 type=VOTER_FULL leaseholder=true
95+
store-id=1 replica-id=1 type=VOTER_FULL leaseholder=true
9696
store-id=2 replica-id=2 type=VOTER_FULL
9797
----
9898

@@ -101,7 +101,7 @@ get-pending-changes
101101
----
102102
pending(2)
103103
change-id=1 store-id=2 node-id=2 range-id=1 load-delta=[cpu:88, write-bandwidth:88, byte-size:88] start=0s gc=5m0s
104-
prev=(replica-id=none type=VOTER_FULL)
104+
prev=(replica-id=2 type=VOTER_FULL)
105105
next=(replica-id=unknown type=VOTER_FULL leaseholder=true)
106106
change-id=2 store-id=1 node-id=1 range-id=1 load-delta=[cpu:-80, write-bandwidth:-80, byte-size:-80] start=0s gc=5m0s
107107
prev=(replica-id=1 type=VOTER_FULL leaseholder=true)
@@ -122,7 +122,7 @@ get-pending-changes
122122
----
123123
pending(2)
124124
change-id=1 store-id=2 node-id=2 range-id=1 load-delta=[cpu:88, write-bandwidth:88, byte-size:88] start=0s gc=5m0s enacted=5s
125-
prev=(replica-id=none type=VOTER_FULL)
125+
prev=(replica-id=2 type=VOTER_FULL)
126126
next=(replica-id=unknown type=VOTER_FULL leaseholder=true)
127127
change-id=2 store-id=1 node-id=1 range-id=1 load-delta=[cpu:-80, write-bandwidth:-80, byte-size:-80] start=0s gc=5m0s enacted=5s
128128
prev=(replica-id=1 type=VOTER_FULL leaseholder=true)
@@ -162,7 +162,7 @@ get-pending-changes
162162
----
163163
pending(2)
164164
change-id=1 store-id=2 node-id=2 range-id=1 load-delta=[cpu:88, write-bandwidth:88, byte-size:88] start=0s gc=5m0s enacted=5s
165-
prev=(replica-id=none type=VOTER_FULL)
165+
prev=(replica-id=2 type=VOTER_FULL)
166166
next=(replica-id=unknown type=VOTER_FULL leaseholder=true)
167167
change-id=2 store-id=1 node-id=1 range-id=1 load-delta=[cpu:-80, write-bandwidth:-80, byte-size:-80] start=0s gc=5m0s enacted=5s
168168
prev=(replica-id=1 type=VOTER_FULL leaseholder=true)

0 commit comments

Comments
 (0)