@@ -669,6 +669,20 @@ type analyzedConstraints struct {
669669 // slices are also empty.
670670 constraints []internedConstraintsConjunction
671671
672+ // satisfiedByReplica[kind][i] contains the set of storeIDs that satisfy
673+ // constraints[i]. These are for stores that satisfy at least one
674+ // constraint. Each store should appear exactly once in the 2D slice
675+ // ac.satisfiedByReplica[kind]. For example,
676+ // satisfiedByReplica[voterIndex][0] = [1, 2, 3], means that the first
677+ // constraint (constraints[0]) is satisfied by storeIDs 1, 2, and 3.
678+ //
679+ // NB: satisfiedByReplica[nonVoterIndex][i] is populated even if this
680+ // analyzedConstraints represents a voter constraint. This is done because
681+ // satisfiedByReplica[voterIndex][i] may not sufficiently satisfy a
682+ // constraint (or even if it does, at a later point one could be
683+ // decommissioning one of the voters), and populating the nonVoterIndex
684+ // allows MMA to decide to promote a non-voter.
685+ //
672686 // Overlapping conjunctions: There is nothing preventing overlapping
673687 // ConstraintsConjunctions such that the same store can satisfy multiple,
674688 // though we expect this to be uncommon. This is algorithmically painful
@@ -691,25 +705,30 @@ type analyzedConstraints struct {
691705 // greedy manner instead of considering all possibilities. So all the
692706 // satisfiedBy slices represent sets that are non-intersecting.
693707 //
694- // satisfiedByReplica[kind][i] contains the set of storeIDs that satisfy
695- // constraints[i]. Populated by initialize and used later by mmma to
696- // determine candidates to satisfy constraints such as
697- // candidatesToReplaceVoterForRebalance.
698- // For example, satisfiedByReplica[voterIndex][0] = [1, 2, 3], means that
699- // the first constraint (constraints[0]) is satisfied by storeIDs 1, 2, and
700- // 3.
701- //
702- // NB: this does not mean the current replica set satisfies constraints[i]
703- // since we may populate satisfiedByReplica[nonVoterIndex][i] for voter
704- // constraints as well. This just means that the store can satisfy
705- // constraints[i] regardless of the replica type.
708+ // Example on why ordering from most strict to least strict is important:
709+ // If the constraints were [region=a,zone=a1:1],[region=a:1] (which is
710+ // sorted in decreasing strictness), a voter store in (region=a,zone=a1)
711+ // would satisfy both, but it would be present only in
712+ // satisfiedByReplica[voterIndex][i] for i=0, not i=1. If the user specified
713+ // the constraints in irregular order, [region=a]:1,[region=a,zone=a1]:1,
714+ // the voter would only be present in satisfiedByReplica[voterIndex][0] as
715+ // well (which now refers to the relaxed constraint), and as a consequence
716+ // we would consider (region=a,zone=a1) unfulfilled by this voter. If the
717+ // full constraints conjunction had been [region=a]:1, [region=a,zone=a1]:1,
718+ // [region=a,zone=a2]:1, and there were voters in that region in zones a1,
719+ // a2, and a3, we could end up satisfying [region=a]:1 with the voter in a1,
720+ // but then wouldn't consider it for the stricter constraint
721+ // [region=a,zone=a1]:1 which no other voter can satisfy. The constraints
722+ // would then appear unsatisfiable to the allocator.
723+
706724 satisfiedByReplica [numReplicaKinds ][][]roachpb.StoreID
707725
708- // These are stores that satisfy no constraint. Even though we are strict
709- // about constraint satisfaction, this can happen if the SpanConfig changed
710- // or the attributes of a store changed. Additionally, if these
711- // analyzedConstraints correspond to voterConstraints, there can be
712- // non-voters here (which is harmless).
726+ // ac.satisfiedNoConstraintReplica[kind] contains the set of storeIDs that
727+ // satisfy no constraint. Even though we are strict about constraint
728+ // satisfaction, this can happen if the SpanConfig changed or the attributes
729+ // of a store changed. Note that if these analyzedConstraints correspond to
730+ // voterConstraints, there can be non-voters here which is never used but
731+ // harmless.
713732 satisfiedNoConstraintReplica [numReplicaKinds ][]roachpb.StoreID
714733}
715734
@@ -744,8 +763,12 @@ func clear2DSlice[T any](v [][]T) [][]T {
744763 return v
745764}
746765
747- // rangeAnalyzedConstraints is a function of the spanConfig and the current
748- // stores that have replicas for that range (including the ReplicaType).
766+ // rangeAnalyzedConstraints represents the analyzed constraint state for a
767+ // range, derived from the range's span config and current replica placement
768+ // (including ReplicaType). It contains information used to handle constraint
769+ // satisfaction as part of allocation decisions (e.g. helping identify candidate
770+ // stores for range rebalancing, lease transfers, up-replication,
771+ // down-replication, validating whether constraints are satisfied).
749772//
750773// LEARNER and VOTER_DEMOTING_LEARNER replicas are ignored.
751774type rangeAnalyzedConstraints struct {
@@ -842,68 +865,39 @@ type storeMatchesConstraintInterface interface {
842865 storeMatches (storeID roachpb.StoreID , constraintConj constraintsConj ) bool
843866}
844867
845- // initialize analyzes the current replica set and determines which constraints
846- // replicas satisfy, populating ac.constraints, ac.satisfiedByReplica and
847- // ac.satisfiedNoConstraintReplica. They are later used by mma to compute
848- // lease-transfer and rebalancing candidates by functions like
849- // candidatesToReplaceVoterForRebalance. The given buf.replicas is already
850- // populated with the current replica set from buf.tryAddingStore.
851- //
852- // For stores that satisfy no constraint,
853- // ac.satisfiedNoConstraintReplica[kind][i] will be populated with the replica
854- // storeIDs that satisfy no constraint.
855- //
856- // For stores that satisfy at least one constraint,
857- // ac.satisfiedByReplica[kind][i] is filled with the storeIDs of replicas that
858- // satisfy ac.constraints[i]. Each store should appear exactly once in the 2D
859- // slice ac.satisfiedByReplica[kind]. Phase 2 below attempts to be optimal by
860- // prioritizing under-satisfied constraints and assigning stores to those first.
861- // A constraint may end up being over-satisfied
862- // (len(ac.satisfiedByReplica[kind][i]) > ac.constraints[i].numReplicas) in
863- // phase 3. A constraint may remain under-satisfied. If a store is
864- // under-satisfied in phase 2, it cannot be corrected in phase 3 and will
865- // continue to be under-satisfied. (TODO: Is this right during review)
866- //
867- // NB: the given ac.constraints can represent either replica or voter
868- // constraints, but we still populate both voter and non-voter indices
869- // regardless because we need to know which constraints are satisfied by all
870- // replicas to determine when promotions or demotions should occur.
868+ // initialize takes in the constraints, current replica set, and a constraint
869+ // matcher. It populates ac.constraints, ac.satisfiedByReplica and
870+ // ac.satisfiedNoConstraintReplica by analyzing the current replica set and the
871+ // constraints replicas satisfy. The given buf.replicas is already populated
872+ // with the current replica set from buf.tryAddingStore.
871873//
872- // It has three phases: 1. For each replica (voter and non-voter), determine
873- // which constraint indices in ac.constraints its store satisfies. We record
874- // these matches in buf.replicaConstraintIndices by iterating over all replicas,
875- // all constraint conjunctions, and checking store satisfaction for each using
876- // constraintMatcher.storeMatches.
874+ // The algorithm proceeds in 3 phases, with the description of each phase
875+ // outlined below at the start of each phase.
877876//
878- // buf.replicaConstraintIndices[kind][i] will be populated with the constraint
879- // indices that the i+1-th voter/non-voter satisfies. This serves as a scratch
880- // space. Note that as we assign constraint to stores that satisfy it, we will
881- // be clearing the constraint indices for that store in
882- // buf.replicaConstraintIndices[kind][i] to indicate that we cannot reuse the
883- // store to satisfy a different constraint .
877+ // NB: ac.initialize guarantees to assign exactly one constraint in
878+ // ac.satisfiedByReplica to each store that satisfies at least one constraint.
879+ // But constraints may remain under-satisfied. If a constraint remains
880+ // under-satisfied after phase 2, it cannot be corrected in phase 3 and remain
881+ // to be under-satisfied. This is not essential to correctness but just happens
882+ // to be true of the algorithm .
884883//
885- // During this phase, it also populates ac.satisfiedByReplica[kind][i] for
886- // stores that satisfy exactly one constraint and populates
887- // ac.satisfiedNoConstraintReplica[kind][i] for stores that satisfy no
888- // constraint. Since we will be assigning at least one constraint to each store,
889- // these stores are unambiguous.
890- //
891- // 2. For each constraint, iterate over all replicas that satisfy it and assign
892- // replicas to the constraint in a round-robin manner, skipping once the
893- // constraint is satisfied. This phase does not allow over-satisfaction.
894884// TODO(wenyihu6): Add an example for phase 2 - if s1 satisfies c1(num=1) and s2
895885// satisfies both c1(num=1) and c2(num=1), we should prefer assigning s2 to c2
896- // rather than consuming it again for c1. TODO(wenyihu6): Verify behavior in
897- // cases like: s1 satisfies c1(num=1) and c2(num=1), s2 satisfies only c1, and
898- // s1 is used for c1 and c2 cannot be satisfied although we could have.
886+ // rather than consuming it again for c1.
899887//
900- // 3. For any remaining stores that satisfy multiple constraints, assign each to
901- // the first constraint it satisfies. This phase allows over-satisfaction, and
902- // some constraints may still end up under- or over-filled.
903-
904- // TODO(wenyihu6): document the lifecycle of scratch space
905- // replicaConstraintIndices once understood. TODO(wenyihu6): add more tests +
906- // examples here
888+ // NB: Note that buf.replicaConstraintIndices serves as a scratch space to
889+ // reduce memory allocation and is expected to be empty at the start of every
890+ // analyzedConstraints.initialize call and cleared at the end of the call. For
891+ // every store that satisfies a constraint, we will be clearing the constraint
892+ // indices for that store in buf.replicaConstraintIndices[kind][i] once we have
893+ // assigned it to a constraint in ac.satisfiedByReplica[kind][i]. Since we
894+ // guarantee that each store that satisfies a constraint is assigned to exactly
895+ // one constraint, buf.replicaConstraintIndices will be an empty 2D slice as
896+ // part of the algorithm. In addition, clearing the constraint indices for that
897+ // store in buf.replicaConstraintIndices is also important to help us indicates
898+ // that the store cannot be reused to satisfy a different constraint.
899+ //
900+ // TODO(wenyihu6): add more tests + examples here
907901func (ac * analyzedConstraints ) initialize (
908902 constraints []internedConstraintsConjunction ,
909903 buf * analyzeConstraintsBuf ,
@@ -918,6 +912,16 @@ func (ac *analyzedConstraints) initialize(
918912 ac .satisfiedByReplica [voterIndex ] = extend2DSlice (ac .satisfiedByReplica [voterIndex ])
919913 ac .satisfiedByReplica [nonVoterIndex ] = extend2DSlice (ac .satisfiedByReplica [nonVoterIndex ])
920914 }
915+ // In phase 1, for each replica (voter and non-voter), determine which
916+ // constraint indices in ac.constraints its store satisfies. We record these
917+ // matches in buf.replicaConstraintIndices by iterating over all replicas,
918+ // all constraint conjunctions, and checking store satisfaction for each
919+ // using constraintMatcher.storeMatches. In addition, it also populates
920+ // ac.satisfiedNoConstraintReplica[kind][i] for stores that satisfy no
921+ // constraint and ac.satisfiedByReplica[kind][i] for stores that satisfy
922+ // exactly one constraint. Since we will be assigning at least one
923+ // constraint to each store, these stores are unambiguous.
924+ //
921925 // Compute the list of all constraints satisfied by each store.
922926 for kind := voterIndex ; kind < numReplicaKinds ; kind ++ {
923927 for i , store := range buf .replicas [kind ] {
@@ -952,6 +956,12 @@ func (ac *analyzedConstraints) initialize(
952956 len (ac.satisfiedByReplica [nonVoterIndex ][constraintIndex ]) >= int (ac .constraints [constraintIndex ].numReplicas )
953957 }
954958
959+ // In phase 2, for each under-satisfied constraint, iterate through all replicas
960+ // and assign them to the first store that meets the constraint. Continue until
961+ // the constraint becomes satisfied, then move on to the next one. Skip further
962+ // matching once a constraint is fulfilled. Note that this phase does not allow
963+ // over-satisfaction.
964+ //
955965 // The only stores not yet in ac are the ones that satisfy multiple
956966 // constraints. For each store, the constraint indices it satisfies are in
957967 // increasing order. Satisfy constraints in order, while not
@@ -975,11 +985,6 @@ func (ac *analyzedConstraints) initialize(
975985 }
976986 }
977987
978- // NB: We check the `satisfied` flag directly instead of calling
979- // `isConstraintSatisfied` again, since this value should be set
980- // in the previous loop correctly when the constraint becomes
981- // satisfied.
982- //
983988 // Check if constraint is now satisfied to avoid
984989 // over-satisfaction.
985990 if satisfied {
@@ -992,6 +997,12 @@ func (ac *analyzedConstraints) initialize(
992997 }
993998 }
994999 }
1000+ // In phase 3, for each remaining store that satisfies >1 constraints,
1001+ // assign each to the first constraint it satisfies. This phase allows
1002+ // over-satisfaction
1003+ // (len(ac.satisfiedByReplica[voterIndex][i]+len(ac.satisfiedByReplica[nonVoterIndex][i]))
1004+ // > ac.constraints[i].numReplicas).
1005+ //
9951006 // Nothing over-satisfied. Go and greedily assign.
9961007 for kind := voterIndex ; kind < numReplicaKinds ; kind ++ {
9971008 for i := range buf .replicaConstraintIndices [kind ] {
@@ -1036,20 +1047,16 @@ func diversityOfTwoStoreSets(
10361047 return sumScore , numSamples
10371048}
10381049
1039- // diversityScore returns the voter and replica diversity scores of the given replicas sets.
1040- //
1041- // voterDiversityScore: sum of diversity scores over all pairs from V×V(voter-voter pairs).
1042- // replicaDiversityScore: sum of diversity scores over all pairs from:
1043- // V×V(voter–voter pairs), N×N(non-voter–non-voter pairs), V×N(voter–non-voter
1044- // pairs).
1045- //
1046- // diversity score of a single pair is computed using
1047- // localityTiers.diversityScore which finds the longest common prefix of two
1048- // localities.
1050+ // diversityScore measures how geographically spread out replicas are across the
1051+ // cluster. Higher scores indicate better fault tolerance because replicas are
1052+ // placed further apart in the locality hierarchy. Returns voterDiversityScore
1053+ // and replicaDiversityScore which are the average pairwise distance across all
1054+ // voter-voter pairs and replica pairs respectively.
10491055func diversityScore (
10501056 replicas [numReplicaKinds ][]storeAndLocality ,
10511057) (voterDiversityScore , replicaDiversityScore float64 ) {
1052- // Helper to calculate average, or a max-value if no samples (represents lowest possible diversity)
1058+ // Helper to calculate average, or a max-value if no samples (represents
1059+ // lowest possible diversity).
10531060 scoreFromSumAndSamples := func (sumScore float64 , numSamples int ) float64 {
10541061 if numSamples == 0 {
10551062 return roachpb .MaxDiversityScore
@@ -1071,9 +1078,10 @@ func diversityScore(
10711078 return scoreFromSumAndSamples (voterSum , voterSamples ), scoreFromSumAndSamples (totalSum , totalSamples )
10721079}
10731080
1074- // finishInit analyzes the span config and the range’s current replica set. It
1075- // prepares the MMA to compute lease-transfer and rebalancing candidates while
1076- // satisfying both constraint requirements and leaseholder preferences.
1081+ // finishInit completes initialization for the rangeAnalyzedConstraints, It
1082+ // initializes the constraints and voterConstraints, determines the leaseholder
1083+ // preference index for all voter replicas, builds locality tier information,
1084+ // and computes diversity scores.
10771085func (rac * rangeAnalyzedConstraints ) finishInit (
10781086 spanConfig * normalizedSpanConfig ,
10791087 constraintMatcher storeMatchesConstraintInterface ,
@@ -1479,6 +1487,9 @@ type analyzeConstraintsBuf struct {
14791487 // constraints. We still populate both voter and non-voter indexes because
14801488 // we need to know which constraints are satisfied by all replicas to
14811489 // determine when promotions or demotions should occur.
1490+ //
1491+ // NB: expected to be empty at the start of every
1492+ // analyzedConstraints.initialize call.
14821493 replicaConstraintIndices [numReplicaKinds ][][]int32
14831494}
14841495
0 commit comments