Skip to content

Commit 638397b

Browse files
committed
Priority queue iterator - [MOD-5529] (#406)
* changed implementation of max heap to have random order iterator * changed heuristic use to use heap iterator * fix sort * bug fix (flip flag) * reserving and using ranges * removing a no-longer relevant test * making heap iterator const * remove the usage of ranges (TODO: return in the future) * review fixes
1 parent 3f6360b commit 638397b

File tree

4 files changed

+131
-259
lines changed

4 files changed

+131
-259
lines changed

src/VecSim/algorithms/hnsw/hnsw.h

Lines changed: 116 additions & 111 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@ typedef uint8_t elementFlags;
4545
template <typename DistType>
4646
using candidatesMaxHeap = vecsim_stl::max_priority_queue<DistType, idType>;
4747
template <typename DistType>
48+
using candidatesList = vecsim_stl::vector<pair<DistType, idType>>;
49+
template <typename DistType>
4850
using candidatesLabelsMaxHeap = vecsim_stl::abstract_priority_queue<DistType, labelType>;
4951
using graphNodeType = pair<idType, ushort>; // represented as: (element_id, level)
5052

@@ -170,7 +172,7 @@ class HNSWIndex : public VecSimIndexAbstract<DistType>,
170172
HNSWIndex() = delete; // default constructor is disabled.
171173
HNSWIndex(const HNSWIndex &) = delete; // default (shallow) copy constructor is disabled.
172174
inline size_t getRandomLevel(double reverse_size);
173-
inline void removeExtraLinks(candidatesMaxHeap<DistType> candidates, size_t Mcurmax,
175+
inline void removeExtraLinks(candidatesList<DistType> candidates, size_t Mcurmax,
174176
LevelData &node_level, const vecsim_stl::vector<bool> &bitmap,
175177
idType *removed_links, size_t *removed_links_num);
176178
template <bool has_marked_deleted, typename Identifier> // Either idType or labelType
@@ -197,7 +199,13 @@ class HNSWIndex : public VecSimIndexAbstract<DistType>,
197199
double epsilon, DistType radius,
198200
void *timeoutCtx,
199201
VecSimQueryResult_Code *rc) const;
200-
void getNeighborsByHeuristic2(candidatesMaxHeap<DistType> &top_candidates, size_t M);
202+
idType getNeighborsByHeuristic2(candidatesList<DistType> &top_candidates, size_t M) const;
203+
void getNeighborsByHeuristic2(candidatesList<DistType> &top_candidates, size_t M,
204+
vecsim_stl::vector<idType> &not_chosen_candidates) const;
205+
template <bool record_removed>
206+
void getNeighborsByHeuristic2_internal(
207+
candidatesList<DistType> &top_candidates, size_t M,
208+
vecsim_stl::vector<idType> *removed_candidates = nullptr) const;
201209
// Helper function for re-selecting node's neighbors which was selected as a neighbor for
202210
// a newly inserted node. Also, responsible for mutually connect the new node and the neighbor
203211
// (unidirectional or bidirectional connection).
@@ -527,32 +535,27 @@ void HNSWIndex<DataType, DistType>::unlockNodeLinks(idType node_id) const {
527535
*/
528536
template <typename DataType, typename DistType>
529537
void HNSWIndex<DataType, DistType>::removeExtraLinks(
530-
candidatesMaxHeap<DistType> candidates, size_t Mcurmax, LevelData &node_level,
538+
candidatesList<DistType> candidates, size_t Mcurmax, LevelData &node_level,
531539
const vecsim_stl::vector<bool> &neighbors_bitmap, idType *removed_links,
532540
size_t *removed_links_num) {
533541

534-
auto orig_candidates = candidates;
535542
// candidates will store the newly selected neighbours (for the relevant node).
536-
getNeighborsByHeuristic2(candidates, Mcurmax);
543+
vecsim_stl::vector<idType> not_selected_candidates(this->allocator);
544+
getNeighborsByHeuristic2(candidates, Mcurmax, not_selected_candidates);
537545

538-
// check the diff in the link list, save the neighbours
539-
// that were chosen to be removed, and update the new neighbours
540-
size_t removed_idx = 0;
541-
size_t link_idx = 0;
546+
// update the node level data.
547+
node_level.numLinks = 0;
548+
for (auto &selected_neighbor : candidates) {
549+
node_level.links[node_level.numLinks++] = selected_neighbor.second;
550+
}
542551

543-
while (orig_candidates.size() > 0) {
544-
if (orig_candidates.top().second != candidates.top().second) {
545-
if (neighbors_bitmap[orig_candidates.top().second]) {
546-
removed_links[removed_idx++] = orig_candidates.top().second;
547-
}
548-
orig_candidates.pop();
549-
} else {
550-
node_level.links[link_idx++] = candidates.top().second;
551-
candidates.pop();
552-
orig_candidates.pop();
552+
// save the neighbours that were chosen to be removed.
553+
size_t removed_idx = 0;
554+
for (auto &not_selected_neighbor : not_selected_candidates) {
555+
if (neighbors_bitmap[not_selected_neighbor]) {
556+
removed_links[removed_idx++] = not_selected_neighbor;
553557
}
554558
}
555-
node_level.numLinks = link_idx;
556559
*removed_links_num = removed_idx;
557560
}
558561

@@ -764,31 +767,53 @@ HNSWIndex<DataType, DistType>::searchLayer(idType ep_id, const void *data_point,
764767
return top_candidates;
765768
}
766769

770+
template <typename DataType, typename DistType>
771+
idType
772+
HNSWIndex<DataType, DistType>::getNeighborsByHeuristic2(candidatesList<DistType> &top_candidates,
773+
const size_t M) const {
774+
if (top_candidates.size() < M) {
775+
return std::min_element(top_candidates.begin(), top_candidates.end(),
776+
[](const auto &a, const auto &b) { return a.first < b.first; })
777+
->second;
778+
} else {
779+
getNeighborsByHeuristic2_internal<false>(top_candidates, M, nullptr);
780+
return top_candidates.front().second;
781+
}
782+
}
783+
767784
template <typename DataType, typename DistType>
768785
void HNSWIndex<DataType, DistType>::getNeighborsByHeuristic2(
769-
candidatesMaxHeap<DistType> &top_candidates, const size_t M) {
786+
candidatesList<DistType> &top_candidates, const size_t M,
787+
vecsim_stl::vector<idType> &removed_candidates) const {
788+
getNeighborsByHeuristic2_internal<true>(top_candidates, M, &removed_candidates);
789+
}
790+
791+
template <typename DataType, typename DistType>
792+
template <bool record_removed>
793+
void HNSWIndex<DataType, DistType>::getNeighborsByHeuristic2_internal(
794+
candidatesList<DistType> &top_candidates, const size_t M,
795+
vecsim_stl::vector<idType> *removed_candidates) const {
770796
if (top_candidates.size() < M) {
771797
return;
772798
}
773799

774-
candidatesMaxHeap<DistType> queue_closest(this->allocator);
775-
vecsim_stl::vector<pair<DistType, idType>> return_list(this->allocator);
800+
candidatesList<DistType> return_list(this->allocator);
776801
vecsim_stl::vector<const void *> cached_vectors(this->allocator);
777802
return_list.reserve(M);
778803
cached_vectors.reserve(M);
779-
while (top_candidates.size() > 0) {
780-
// the distance is saved negatively to have the queue ordered such that first is closer
781-
// (higher).
782-
queue_closest.emplace(-top_candidates.top().first, top_candidates.top().second);
783-
top_candidates.pop();
804+
if constexpr (record_removed) {
805+
removed_candidates->reserve(top_candidates.size());
784806
}
785807

786-
while (queue_closest.size() && return_list.size() < M) {
787-
pair<DistType, idType> current_pair = queue_closest.top();
788-
DistType candidate_to_query_dist = -current_pair.first;
789-
queue_closest.pop();
808+
// Sort the candidates by their distance (we don't mind the secondary order (the internal id))
809+
std::sort(top_candidates.begin(), top_candidates.end(),
810+
[](const auto &a, const auto &b) { return a.first < b.first; });
811+
812+
auto current_pair = top_candidates.begin();
813+
for (; current_pair != top_candidates.end() && return_list.size() < M; ++current_pair) {
814+
DistType candidate_to_query_dist = current_pair->first;
790815
bool good = true;
791-
const void *curr_vector = getDataByInternalId(current_pair.second);
816+
const void *curr_vector = getDataByInternalId(current_pair->second);
792817

793818
// a candidate is "good" to become a neighbour, unless we find
794819
// another item that was already selected to the neighbours set which is closer
@@ -797,19 +822,26 @@ void HNSWIndex<DataType, DistType>::getNeighborsByHeuristic2(
797822
DistType candidate_to_selected_dist =
798823
this->distFunc(cached_vectors[i], curr_vector, this->dim);
799824
if (candidate_to_selected_dist < candidate_to_query_dist) {
825+
if constexpr (record_removed) {
826+
removed_candidates->push_back(current_pair->second);
827+
}
800828
good = false;
801829
break;
802830
}
803831
}
804832
if (good) {
805833
cached_vectors.push_back(curr_vector);
806-
return_list.push_back(current_pair);
834+
return_list.push_back(*current_pair);
807835
}
808836
}
809837

810-
for (pair<DistType, idType> current_pair : return_list) {
811-
top_candidates.emplace(-current_pair.first, current_pair.second);
838+
if constexpr (record_removed) {
839+
for (; current_pair != top_candidates.end(); ++current_pair) {
840+
removed_candidates->push_back(current_pair->second);
841+
}
812842
}
843+
844+
top_candidates.swap(return_list);
813845
}
814846

815847
template <typename DataType, typename DistType>
@@ -819,57 +851,41 @@ void HNSWIndex<DataType, DistType>::revisitNeighborConnections(
819851
// Note - expect that node_lock and neighbor_lock are locked at that point.
820852

821853
// Collect the existing neighbors and the new node as the neighbor's neighbors candidates.
822-
candidatesMaxHeap<DistType> candidates(this->allocator);
854+
candidatesList<DistType> candidates(this->allocator);
855+
candidates.reserve(neighbor_level.numLinks + 1);
823856
// Add the new node along with the pre-calculated distance to the current neighbor,
824-
candidates.emplace(neighbor_data.first, new_node_id);
857+
candidates.emplace_back(neighbor_data.first, new_node_id);
825858

826859
idType selected_neighbor = neighbor_data.second;
827860
const void *selected_neighbor_data = getDataByInternalId(selected_neighbor);
828861
for (size_t j = 0; j < neighbor_level.numLinks; j++) {
829-
candidates.emplace(this->distFunc(getDataByInternalId(neighbor_level.links[j]),
830-
selected_neighbor_data, this->dim),
831-
neighbor_level.links[j]);
862+
candidates.emplace_back(this->distFunc(getDataByInternalId(neighbor_level.links[j]),
863+
selected_neighbor_data, this->dim),
864+
neighbor_level.links[j]);
832865
}
833866

834-
std::vector<idType> nodes_to_update;
835-
auto orig_candidates = candidates;
836-
837867
// Candidates will store the newly selected neighbours (for the neighbor).
838868
size_t max_M_cur = level ? M : M0;
839-
getNeighborsByHeuristic2(candidates, max_M_cur);
840-
841-
// Go over the original candidates set, and save the ones chosen to be removed to update later
842-
// on.
843-
bool cur_node_chosen = false;
844-
while (orig_candidates.size() > 0) {
845-
idType orig_candidate = orig_candidates.top().second;
846-
// If the current original candidate was not selected as neighbor by the heuristics, it
847-
// should be updated and removed from the neighbor's neighbors.
848-
if (candidates.empty() || orig_candidate != candidates.top().second) {
849-
// Don't add the new_node_id to nodes_to_update, it will be inserted either way later.
850-
if (orig_candidate != new_node_id) {
851-
nodes_to_update.push_back(orig_candidate);
852-
}
853-
orig_candidates.pop();
854-
// Otherwise, the original candidate was selected to remain a neighbor - no need to
855-
// update.
856-
} else {
857-
candidates.pop();
858-
orig_candidates.pop();
859-
if (orig_candidate == new_node_id) {
860-
cur_node_chosen = true;
861-
}
862-
}
863-
}
869+
vecsim_stl::vector<idType> nodes_to_update(this->allocator);
870+
getNeighborsByHeuristic2(candidates, max_M_cur, nodes_to_update);
864871

865872
// Acquire all relevant locks for making the updates for the selected neighbor - all its removed
866873
// neighbors, along with the neighbors itself and the cur node.
867874
// but first, we release the node and neighbors lock to avoid deadlocks.
868875
unlockNodeLinks(new_node_id);
869876
unlockNodeLinks(selected_neighbor);
870877

878+
// Check if the new node was selected as a neighbor for the current neighbor.
879+
// Make sure to add the cur node to the list of nodes to update if it was selected.
880+
bool cur_node_chosen;
881+
auto new_node_iter = std::find(nodes_to_update.begin(), nodes_to_update.end(), new_node_id);
882+
if (new_node_iter != nodes_to_update.end()) {
883+
cur_node_chosen = false;
884+
} else {
885+
cur_node_chosen = true;
886+
nodes_to_update.push_back(new_node_id);
887+
}
871888
nodes_to_update.push_back(selected_neighbor);
872-
nodes_to_update.push_back(new_node_id);
873889

874890
std::sort(nodes_to_update.begin(), nodes_to_update.end());
875891
size_t nodes_to_update_count = nodes_to_update.size();
@@ -935,27 +951,21 @@ idType HNSWIndex<DataType, DistType>::mutuallyConnectNewElement(
935951
size_t max_M_cur = level ? M : M0;
936952

937953
// Filter the top candidates to the selected neighbors by the algorithm heuristics.
938-
getNeighborsByHeuristic2(top_candidates, M);
939-
assert(top_candidates.size() <= M &&
954+
// First, we need to copy the top candidates to a vector.
955+
candidatesList<DistType> top_candidates_list(this->allocator);
956+
top_candidates_list.insert(top_candidates_list.end(), top_candidates.begin(),
957+
top_candidates.end());
958+
// Use the heuristic to filter the top candidates, and get the next closest entry point.
959+
idType next_closest_entry_point = getNeighborsByHeuristic2(top_candidates_list, M);
960+
assert(top_candidates_list.size() <= M &&
940961
"Should be not be more than M candidates returned by the heuristic");
941962

942-
// Hold (distance_from_new_node_id, neighbor_id) pair for every selected neighbor.
943-
vecsim_stl::vector<std::pair<DistType, idType>> selected_neighbors(this->allocator);
944-
selected_neighbors.reserve(M);
945-
while (!top_candidates.empty()) {
946-
selected_neighbors.push_back(top_candidates.top());
947-
top_candidates.pop();
948-
}
949-
950-
// The closest vector that has found to be returned (and start the scan from it in the next
951-
// level).
952-
idType next_closest_entry_point = selected_neighbors.back().second;
953963
auto *new_node_level = getGraphDataByInternalId(new_node_id);
954964
LevelData &new_node_level_data = getLevelData(new_node_level, level);
955965
assert(new_node_level_data.numLinks == 0 &&
956966
"The newly inserted element should have blank link list");
957967

958-
for (auto &neighbor_data : selected_neighbors) {
968+
for (auto &neighbor_data : top_candidates_list) {
959969
idType selected_neighbor = neighbor_data.second; // neighbor's id
960970
auto *neighbor_graph_data = getGraphDataByInternalId(selected_neighbor);
961971
if (new_node_id < selected_neighbor) {
@@ -1015,14 +1025,15 @@ void HNSWIndex<DataType, DistType>::repairConnectionsForDeletion(
10151025
LevelData &neighbor_level, size_t level, vecsim_stl::vector<bool> &neighbours_bitmap) {
10161026

10171027
// put the deleted element's neighbours in the candidates.
1018-
candidatesMaxHeap<DistType> candidates(this->allocator);
1028+
candidatesList<DistType> candidates(this->allocator);
1029+
candidates.reserve(node_level.numLinks + neighbor_level.numLinks);
10191030
auto neighbours_data = getDataByInternalId(neighbour_id);
10201031
for (size_t j = 0; j < node_level.numLinks; j++) {
10211032
// Don't put the neighbor itself in his own candidates
10221033
if (node_level.links[j] == neighbour_id) {
10231034
continue;
10241035
}
1025-
candidates.emplace(
1036+
candidates.emplace_back(
10261037
this->distFunc(getDataByInternalId(node_level.links[j]), neighbours_data, this->dim),
10271038
node_level.links[j]);
10281039
}
@@ -1038,9 +1049,10 @@ void HNSWIndex<DataType, DistType>::repairConnectionsForDeletion(
10381049
neighbor_level.links[j] == element_internal_id) {
10391050
continue;
10401051
}
1041-
candidates.emplace(this->distFunc(neighbours_data,
1042-
getDataByInternalId(neighbor_level.links[j]), this->dim),
1043-
neighbor_level.links[j]);
1052+
candidates.emplace_back(this->distFunc(neighbours_data,
1053+
getDataByInternalId(neighbor_level.links[j]),
1054+
this->dim),
1055+
neighbor_level.links[j]);
10441056
}
10451057

10461058
size_t Mcurmax = level ? M : M0;
@@ -1488,7 +1500,7 @@ void HNSWIndex<DataType, DistType>::mutuallyUpdateForRepairedNode(
14881500
template <typename DataType, typename DistType>
14891501
void HNSWIndex<DataType, DistType>::repairNodeConnections(idType node_id, size_t level) {
14901502

1491-
candidatesMaxHeap<DistType> neighbors_candidates(this->allocator);
1503+
candidatesList<DistType> neighbors_candidates(this->allocator);
14921504
// Use bitmaps for fast accesses:
14931505
// node_orig_neighbours_set is used to differentiate between the neighbors that will *not* be
14941506
// selected by the heuristics - only the ones that were originally neighbors should be removed.
@@ -1513,7 +1525,7 @@ void HNSWIndex<DataType, DistType>::repairNodeConnections(idType node_id, size_t
15131525
continue;
15141526
}
15151527
neighbors_candidates_set[node_level_data.links[j]] = true;
1516-
neighbors_candidates.emplace(
1528+
neighbors_candidates.emplace_back(
15171529
this->distFunc(node_data, getDataByInternalId(node_level_data.links[j]), this->dim),
15181530
node_level_data.links[j]);
15191531
}
@@ -1550,37 +1562,30 @@ void HNSWIndex<DataType, DistType>::repairNodeConnections(idType node_id, size_t
15501562
continue;
15511563
}
15521564
neighbors_candidates_set[neighbor_level_data.links[j]] = true;
1553-
neighbors_candidates.emplace(
1565+
neighbors_candidates.emplace_back(
15541566
this->distFunc(node_data, getDataByInternalId(neighbor_level_data.links[j]),
15551567
this->dim),
15561568
neighbor_level_data.links[j]);
15571569
}
15581570
unlockNodeLinks(neighbor);
15591571
}
15601572

1561-
// Copy the original candidates, and run the heuristics. Afterwards, neighbors_candidates will
1562-
// store the newly selected neighbours (for the node), while candidates which were originally
1563-
// neighbors and are not going to be selected, are going to be removed.
1564-
auto orig_candidates = neighbors_candidates;
15651573
size_t max_M_cur = level ? M : M0;
1566-
getNeighborsByHeuristic2(neighbors_candidates, max_M_cur);
1567-
1568-
while (!orig_candidates.empty()) {
1569-
idType orig_candidate = orig_candidates.top().second;
1570-
if (neighbors_candidates.empty() || orig_candidate != neighbors_candidates.top().second) {
1571-
if (node_orig_neighbours_set[orig_candidate]) {
1572-
neighbors_to_remove.push_back(orig_candidate);
1573-
nodes_to_update.push_back(orig_candidate);
1574-
}
1575-
orig_candidates.pop();
1576-
} else {
1577-
chosen_neighbors.push_back(orig_candidate);
1578-
nodes_to_update.push_back(orig_candidate);
1579-
neighbors_candidates.pop();
1580-
orig_candidates.pop();
1574+
vecsim_stl::vector<idType> not_chosen_neighbors(this->allocator);
1575+
getNeighborsByHeuristic2(neighbors_candidates, max_M_cur, not_chosen_neighbors);
1576+
1577+
for (idType not_chosen_neighbor : not_chosen_neighbors) {
1578+
if (node_orig_neighbours_set[not_chosen_neighbor]) {
1579+
neighbors_to_remove.push_back(not_chosen_neighbor);
1580+
nodes_to_update.push_back(not_chosen_neighbor);
15811581
}
15821582
}
15831583

1584+
for (auto &neighbor : neighbors_candidates) {
1585+
chosen_neighbors.push_back(neighbor.second);
1586+
nodes_to_update.push_back(neighbor.second);
1587+
}
1588+
15841589
// Perform the actual updates for the node and the impacted neighbors while holding the nodes'
15851590
// locks.
15861591
mutuallyUpdateForRepairedNode(node_id, level, neighbors_to_remove, nodes_to_update,

src/VecSim/algorithms/hnsw/hnsw_base_tests_friends.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,5 +20,4 @@ INDEX_TEST_FRIEND_CLASS(HNSWTestParallel_parallelSearchKnn_Test)
2020
INDEX_TEST_FRIEND_CLASS(HNSWTestParallel_parallelSearchCombined_Test)
2121
INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTestBasic_deleteFromHNSWMultiLevels_Test)
2222
INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_deleteFromHNSWWithRepairJobExec_Test)
23-
INDEX_TEST_FRIEND_CLASS(HNSWTest_testIncomingEdgesSize_Test)
2423
INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_swapJobBasic_Test)

0 commit comments

Comments
 (0)