@@ -207,12 +207,12 @@ type ReplicaChange struct {
207207 // replica being demoted cannot retain the lease).
208208 //
209209 // NB: The prev value is always the state before the change. This is the
210- // source of truth provided by the leaseholder in the RangeMsg, so will
211- // have real ReplicaIDs (if already a replica) and real ReplicaTypes
210+ // latest source of truth provided by the leaseholder in the RangeMsg, so
211+ // will have real ReplicaIDs (if already a replica) and real ReplicaTypes
212212 // (including types beyond VOTER_FULL and NON_VOTER). This source-of-truth
213213 // claim is guaranteed by REQUIREMENT(change-computation) documented
214- // elsewhere, and the fact that new changes are computed only when there
215- // are no pending changes for a range.
214+ // elsewhere, and the fact that new changes are computed only when there are
215+ // no pending changes for a range.
216216 //
217217 // The ReplicaType in next is either the zero value (for removals), or
218218 // {VOTER_FULL, NON_VOTER} for additions/change, i.e., it represents the
@@ -221,6 +221,9 @@ type ReplicaChange struct {
221221 // TODO(tbg): in MakeLeaseTransferChanges, next.ReplicaType.ReplicaType is
222222 // simply the current value, and not necessarily {VOTER_FULL, NON_VOTER}.
223223 // So the above comment is incorrect. We should clean this up.
224+ //
225+ // The prev field is mutable after creation, to ensure that an undo restores
226+ // the state to the latest source of truth from the leaseholder.
224227 prev ReplicaState
225228 next ReplicaIDAndType
226229}
@@ -251,6 +254,9 @@ func (rc ReplicaChange) isUpdate() bool {
251254 return changeType == AddLease || changeType == RemoveLease || changeType == ChangeReplica
252255}
253256
257+ // replicaChangeType returns the type of change currently represented by prev
258+ // and next. Since prev is mutable, the result of this method can change over
259+ // time.
254260func (rc ReplicaChange ) replicaChangeType () ReplicaChangeType {
255261 prevExists := replicaExists (rc .prev .ReplicaID )
256262 nextExists := replicaExists (rc .next .ReplicaID )
@@ -630,21 +636,17 @@ type pendingReplicaChange struct {
630636 // expiry. All replica changes in a PendingRangeChange have the same
631637 // startTime.
632638 startTime time.Time
633- // gcTime represents a time when the unenacted change should be GC'd, either
634- // using the normal GC undo path, or if rangeState.pendingChangeNoRollback
635- // is true, when processing a RangeMsg from the leaseholder .
639+ // gcTime represents a time when the unenacted change should be GC'd.
640+ //
641+ // Mutable after creation .
636642 gcTime time.Time
637643
638- // TODO(kvoli,sumeerbhola): Consider adopting an explicit expiration time,
639- // after which the change is considered to have been rejected. This would
640- // allow a different expiration time for different types of changes, e.g.,
641- // lease transfers would have a smaller expiration time than replica
642- // additions.
643-
644644 // When the change is known to be enacted based on the authoritative
645645 // information received from the leaseholder, this value is set, so that even
646646 // if the store with a replica affected by this pending change does not tell
647647 // us about the enactment, we can garbage collect this change.
648+ //
649+ // Mutable after creation.
648650 enactedAtTime time.Time
649651}
650652
@@ -955,7 +957,7 @@ type rangeState struct {
955957 // that are still at the initial state, or an intermediate state, it can
956958 // continue anticipating that these pending changes will happen. Tracking
957959 // what is pending also allows for undo in the case of explicit failure,
958- // notified by AdjustPendingChangesDisposition.
960+ // notified by AdjustPendingChangesDisposition, or GC .
959961 //
960962 // 2. Lifecycle
961963 // pendingChanges track proposed modifications to a range's replicas or
@@ -981,17 +983,9 @@ type rangeState struct {
981983 // has been enacted in this case.
982984 //
983985 // 2. Undone as failed: corresponding replica and load change is rolled back.
984- // Note that for replica changes that originate from one action, all changes
985- // would be undone together.
986- // NB: pending changes of a range state originate from one decision.
987- // Therefore, when one pending change is enacted successfully, we mark this
988- // range state's pending changes as no rollback (read more about this in 3).
989- // If we are here trying to undo a pending change but the range state has
990- // already been marked as no rollback, we do not undo the remaining pending
991- // changes. Instead, we wait for a StoreLeaseholderMsg to discard the pending
992- // changes and revert the load adjustments after the
993- // partiallyEnactedGCDuration has elapsed since the first enacted change. The
994- // modeling here is imperfect (read more about this in 3).
986+ // Note that for replica changes that originate from one action, some changes
987+ // can be considered done because of the leaseholder msg, and others can be
988+ // rolled back (say due to GC).
995989 //
996990 // This happens when:
997991 // - The pending change failed to apply via
@@ -1052,14 +1046,10 @@ type rangeState struct {
10521046 // the replica and leaseholder to s4. An intermediate state that can be
10531047 // observed is {s1, s2, s3, s4} with the lease still at s3. But the pending
10541048 // change for adding s4 includes both that it has a replica, and it has the
1055- // lease, so we will not mark it done, and keep pretending that the whole
1056- // change is pending. Since lease transfers are fast, we accept this
1057- // imperfect modeling fidelity. One consequence of this imperfect modeling
1058- // is that if in this example there are no further changes observed until
1059- // GC, the allocator will undo both changes and go back to the state {s1,
1060- // s2, s3} with s3 as the leaseholder. That is, it has forgotten that s4 was
1061- // added. This is unavoidable and will be fixed by the first
1062- // StoreLeaseholderMsg post-GC.
1049+ // lease, so we will not mark it done, and keep pretending that the change
1050+ // is pending. However, we will change the prev state to indicate that s4
1051+ // has a replica, so that undo (say due to GC) rolls back to the latest
1052+ // source-of-truth from the leaseholder.
10631053 //
10641054 // 4. Non Atomicity Hazard
10651055 //
@@ -1068,20 +1058,19 @@ type rangeState struct {
10681058 // to contend with the hazard of having two leaseholders or no leaseholders.
10691059 // In the earlier example, say s3 and s4 were both local stores (a
10701060 // multi-store node), it may be possible to observe an intermediate state
1071- // {s1, s2, s3, s4} where s4 is the leaseholder. If we subsequently get a
1072- // spurious AdjustPendingChangesDisposition(success=false) call, or
1073- // time-based GC causes the s3 removal to be undone, there will be two
1074- // replicas marked as the leaseholder. The other extreme is believing that
1075- // the s3 transfer is done and the s4 incoming replica (and lease) failed
1076- // (this may not actually be possible because of the surrounding code).
1061+ // {s1, s2, s3, s4} where s4 is the leaseholder. We need to ensure that if
1062+ // we subsequently get a spurious
1063+ // AdjustPendingChangesDisposition(success=false) call, or time-based GC
1064+ // causes the s3 removal to be undone, there will not be two replicas marked
1065+ // as the leaseholder. The other extreme is believing that the s3 transfer
1066+ // is done and the s4 incoming replica (and lease) failed (this may not
1067+ // actually be possible because of the surrounding code).
10771068 //
1078- // We deal with this hazard by observing that we've constructed multiple
1079- // pending changes in order to observe intermediate changes in the common
1080- // case of success. Once one change in the set of changes is considered
1081- // enacted, we mark the whole remaining group as no-rollback. In the above
1082- // case, if we see s4 has become the leaseholder, the s1 removal can't undo
1083- // itself -- it can be dropped if it is considered subsumed when processing
1084- // a RangeMsg, or it can be GC'd.
1069+ // This hazard is dealt with in the same way outlined in the earlier
1070+ // example: when the leaseholder msg from s4 arrives that lists {s1, s2, s3,
1071+ // s4} as replicas, the prev state for the s3 change is updated to indicate
1072+ // that it is not the leaseholder. This means that if the change is undone,
1073+ // it will return to a prev state where it has a replica but not the lease.
10851074 //
10861075 // Additionally, when processing a RangeMsg, if any of the pending changes
10871076 // is considered inconsistent, all the pending changes are discarded. This
@@ -1101,11 +1090,6 @@ type rangeState struct {
11011090 // rangeState.pendingChanges across all ranges in clusterState.ranges will
11021091 // be identical to clusterState.pendingChanges.
11031092 pendingChanges []* pendingReplicaChange
1104- // When set, the pendingChanges can not be rolled back anymore. They have
1105- // to be enacted, or discarded wholesale in favor of the latest RangeMsg
1106- // from the leaseholder. It is reset to false when pendingChanges
1107- // transitions from empty to non-empty.
1108- pendingChangeNoRollback bool
11091093
11101094 // If non-nil, it is up-to-date. Typically, non-nil for a range that has no
11111095 // pendingChanges and is not satisfying some constraint, since we don't want
@@ -1449,27 +1433,15 @@ func (cs *clusterState) processStoreLeaseholderMsgInternal(
14491433 // The change has been enacted according to the leaseholder.
14501434 enactedChanges = append (enactedChanges , change )
14511435 } else {
1436+ // Not subsumed. Replace the prev with the latest source of truth from
1437+ // the leaseholder. Note, this can be the noReplicaID case from above.
1438+ change .prev = adjustedReplica
14521439 remainingChanges = append (remainingChanges , change )
14531440 }
14541441 }
1455- gcRemainingChanges := false
1456- if rs .pendingChangeNoRollback {
1457- // A previous StoreLeaseholderMsg has enacted some changes, so the
1458- // remainingChanges may be GC'able. All of them share the same GC time.
1459- // Note that normal GC will not GC these, since normal GC needs to undo,
1460- // and we are not allowed to undo these.
1461- if len (remainingChanges ) > 0 {
1462- gcTime := remainingChanges [0 ].gcTime
1463- if gcTime .Before (now ) {
1464- gcRemainingChanges = true
1465- }
1466- }
1467- } else if len (enactedChanges ) > 0 && len (remainingChanges ) > 0 {
1468- // First time this set of changes is seeing something enacted, and there
1469- // are remaining changes.
1442+ if len (enactedChanges ) > 0 && len (remainingChanges ) > 0 {
1443+ // There are remaining changes, so potentially update their gcTime.
14701444 //
1471- // No longer permitted to rollback.
1472- rs .pendingChangeNoRollback = true
14731445 // All remaining changes have the same gcTime.
14741446 curGCTime := remainingChanges [0 ].gcTime
14751447 revisedGCTime := now .Add (partiallyEnactedGCDuration )
@@ -1513,27 +1485,19 @@ func (cs *clusterState) processStoreLeaseholderMsgInternal(
15131485 // preCheckOnApplyReplicaChanges returns false if there are any pending
15141486 // changes, and these are the changes that are pending. This is hacky
15151487 // and should be cleaned up.
1516- var valid bool
1517- var reason redact.RedactableString
1518- if gcRemainingChanges {
1519- reason = "GCing remaining changes after partial enactment"
1520- } else {
1521- // NB: rs.pendingChanges contains the same changes as
1522- // remainingChanges, but they are not the same slice.
1523- rc := rs .pendingChanges
1524- rs .pendingChanges = nil
1525- err := cs .preCheckOnApplyReplicaChanges (PendingRangeChange {
1526- RangeID : rangeMsg .RangeID ,
1527- pendingReplicaChanges : remainingChanges ,
1528- })
1529- valid = err == nil
1530- if err != nil {
1531- reason = redact .Sprint (err )
1532- }
1533- // Restore it.
1534- rs .pendingChanges = rc
1535- }
1536- if valid {
1488+ //
1489+ // NB: rs.pendingChanges contains the same changes as
1490+ // remainingChanges, but they are not the same slice.
1491+ rc := rs .pendingChanges
1492+ rs .pendingChanges = nil
1493+ err := cs .preCheckOnApplyReplicaChanges (PendingRangeChange {
1494+ RangeID : rangeMsg .RangeID ,
1495+ pendingReplicaChanges : remainingChanges ,
1496+ })
1497+ // Restore it.
1498+ rs .pendingChanges = rc
1499+
1500+ if err == nil {
15371501 // Re-apply the remaining changes. Note that the load change was not
15381502 // undone above, so we pass !applyLoadChange, to avoid applying it
15391503 // again. Also note that applyReplicaChange does not add to the various
@@ -1543,6 +1507,7 @@ func (cs *clusterState) processStoreLeaseholderMsgInternal(
15431507 cs .applyReplicaChange (change .ReplicaChange , false )
15441508 }
15451509 } else {
1510+ reason := redact .Sprint (err )
15461511 // The current state provided by the leaseholder does not permit these
15471512 // changes, so we need to drop them. This should be rare, but can happen
15481513 // if the leaseholder executed a change that MMA was completely unaware
@@ -1787,7 +1752,6 @@ func (cs *clusterState) processStoreLeaseholderMsgInternal(
17871752 topk := ss .adjusted .topKRanges [msg .StoreID ]
17881753 topk .doneInit ()
17891754 }
1790-
17911755}
17921756
17931757// If the pending replica change does not happen within this GC duration, we
@@ -1821,15 +1785,6 @@ func (cs *clusterState) gcPendingChanges(now time.Time) {
18211785 if ! ok {
18221786 panic (errors .AssertionFailedf ("range %v not found in cluster state" , rangeID ))
18231787 }
1824-
1825- // Unlike normal GC that reverts changes, we want to discard these pending
1826- // changes. Do nothing here; processStoreLeaseholderMsgInternal will later
1827- // detect and discard these pending changes. Note that
1828- // processStoreLeaseholderMsgInternal will not revert the pending load
1829- // change.
1830- if rs .pendingChangeNoRollback {
1831- continue
1832- }
18331788 if len (rs .pendingChanges ) == 0 {
18341789 panic (errors .AssertionFailedf ("no pending changes in range %v" , rangeID ))
18351790 }
@@ -1871,8 +1826,6 @@ func (cs *clusterState) pendingChangeEnacted(cid changeID, enactedAt time.Time)
18711826}
18721827
18731828// undoPendingChange reverses the change with ID cid.
1874- //
1875- // REQUIRES: the change is not marked as no-rollback.
18761829func (cs * clusterState ) undoPendingChange (cid changeID ) {
18771830 change , ok := cs .pendingChanges [cid ]
18781831 if ! ok {
@@ -1882,10 +1835,6 @@ func (cs *clusterState) undoPendingChange(cid changeID) {
18821835 if ! ok {
18831836 panic (errors .AssertionFailedf ("range %v not found in cluster state" , change .rangeID ))
18841837 }
1885- if rs .pendingChangeNoRollback {
1886- // One cannot undo changes once no-rollback is true.
1887- panic (errors .AssertionFailedf ("pending change is marked as no-rollback" ))
1888- }
18891838 // Wipe the analyzed constraints, as the range has changed.
18901839 rs .clearAnalyzedConstraints ()
18911840 rs .lastFailedChange = cs .ts .Now ()
@@ -1944,9 +1893,8 @@ func (cs *clusterState) addPendingRangeChange(change PendingRangeChange) {
19441893 // Only the lease is being transferred.
19451894 gcDuration = pendingLeaseTransferGCDuration
19461895 }
1947- pendingChanges := change .pendingReplicaChanges
19481896 now := cs .ts .Now ()
1949- for _ , pendingChange := range pendingChanges {
1897+ for _ , pendingChange := range change . pendingReplicaChanges {
19501898 cs .applyReplicaChange (pendingChange .ReplicaChange , true )
19511899 cs .changeSeqGen ++
19521900 cid := cs .changeSeqGen
@@ -1959,11 +1907,9 @@ func (cs *clusterState) addPendingRangeChange(change PendingRangeChange) {
19591907 cs .pendingChanges [cid ] = pendingChange
19601908 storeState .adjusted .loadPendingChanges [cid ] = pendingChange
19611909 rangeState .pendingChanges = append (rangeState .pendingChanges , pendingChange )
1962- rangeState .pendingChangeNoRollback = false
19631910 log .KvDistribution .VEventf (context .Background (), 3 ,
19641911 "addPendingRangeChange: change_id=%v, range_id=%v, change=%v" ,
19651912 cid , rangeID , pendingChange .ReplicaChange )
1966- pendingChanges = append (pendingChanges , pendingChange )
19671913 }
19681914}
19691915
@@ -2031,8 +1977,6 @@ func (cs *clusterState) preCheckOnApplyReplicaChanges(rangeChange PendingRangeCh
20311977// preCheckOnUndoReplicaChanges does some validation of the changes being
20321978// proposed for undo.
20331979//
2034- // REQUIRES: the rangeState.pendingChangeNoRollback is false.
2035- //
20361980// This method is defensive since if we always check against the current state
20371981// before allowing a change to be added (including re-addition after a
20381982// StoreLeaseholderMsg), we should never have invalidity during an undo, if
0 commit comments