Skip to content

Commit a6609c3

Browse files
committed
Optimize repairing connections (#416)
* first attempt to reduce computation for deleting * more optimizations * refactors `removeExtraLinks` for less data moving * remove `removeExtraLinks` * remove unused code
1 parent deb498c commit a6609c3

File tree

2 files changed

+84
-90
lines changed

2 files changed

+84
-90
lines changed

src/VecSim/algorithms/hnsw/hnsw.h

Lines changed: 84 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,20 @@ struct LevelData {
9191

9292
LevelData(std::shared_ptr<VecSimAllocator> allocator)
9393
: incomingEdges(new (allocator) vecsim_stl::vector<idType>(allocator)), numLinks(0) {}
94+
95+
// Sets the outgoing links of the current element.
96+
// Assumes that the object has the capacity to hold all the links.
97+
inline void setLinks(vecsim_stl::vector<idType> &links) {
98+
numLinks = links.size();
99+
memcpy(this->links, links.data(), numLinks * sizeof(idType));
100+
}
101+
template <typename DistType>
102+
inline void setLinks(candidatesList<DistType> &links) {
103+
numLinks = 0;
104+
for (auto &link : links) {
105+
this->links[numLinks++] = link.second;
106+
}
107+
}
94108
};
95109

96110
struct ElementGraphData {
@@ -171,9 +185,6 @@ class HNSWIndex : public VecSimIndexAbstract<DistType>,
171185
HNSWIndex() = delete; // default constructor is disabled.
172186
HNSWIndex(const HNSWIndex &) = delete; // default (shallow) copy constructor is disabled.
173187
inline size_t getRandomLevel(double reverse_size);
174-
inline void removeExtraLinks(candidatesList<DistType> candidates, size_t Mcurmax,
175-
LevelData &node_level, const vecsim_stl::vector<bool> &bitmap,
176-
idType *removed_links, size_t *removed_links_num);
177188
template <bool has_marked_deleted, typename Identifier> // Either idType or labelType
178189
inline void
179190
processCandidate(idType curNodeId, const void *data_point, size_t layer, size_t ef,
@@ -532,32 +543,6 @@ void HNSWIndex<DataType, DistType>::unlockNodeLinks(idType node_id) const {
532543
/**
533544
* helper functions
534545
*/
535-
template <typename DataType, typename DistType>
536-
void HNSWIndex<DataType, DistType>::removeExtraLinks(
537-
candidatesList<DistType> candidates, size_t Mcurmax, LevelData &node_level,
538-
const vecsim_stl::vector<bool> &neighbors_bitmap, idType *removed_links,
539-
size_t *removed_links_num) {
540-
541-
// candidates will store the newly selected neighbours (for the relevant node).
542-
vecsim_stl::vector<idType> not_selected_candidates(this->allocator);
543-
getNeighborsByHeuristic2(candidates, Mcurmax, not_selected_candidates);
544-
545-
// update the node level data.
546-
node_level.numLinks = 0;
547-
for (auto &selected_neighbor : candidates) {
548-
node_level.links[node_level.numLinks++] = selected_neighbor.second;
549-
}
550-
551-
// save the neighbours that were chosen to be removed.
552-
size_t removed_idx = 0;
553-
for (auto &not_selected_neighbor : not_selected_candidates) {
554-
if (neighbors_bitmap[not_selected_neighbor]) {
555-
removed_links[removed_idx++] = not_selected_neighbor;
556-
}
557-
}
558-
*removed_links_num = removed_idx;
559-
}
560-
561546
template <typename DataType, typename DistType>
562547
void HNSWIndex<DataType, DistType>::emplaceToHeap(
563548
vecsim_stl::abstract_priority_queue<DistType, idType> &heap, DistType dist, idType id) const {
@@ -1024,56 +1009,61 @@ void HNSWIndex<DataType, DistType>::repairConnectionsForDeletion(
10241009
LevelData &neighbor_level, size_t level, vecsim_stl::vector<bool> &neighbours_bitmap) {
10251010

10261011
// put the deleted element's neighbours in the candidates.
1027-
candidatesList<DistType> candidates(this->allocator);
1028-
candidates.reserve(node_level.numLinks + neighbor_level.numLinks);
1029-
auto neighbours_data = getDataByInternalId(neighbour_id);
1012+
vecsim_stl::vector<idType> candidate_ids(this->allocator);
1013+
candidate_ids.reserve(node_level.numLinks + neighbor_level.numLinks);
10301014
for (size_t j = 0; j < node_level.numLinks; j++) {
10311015
// Don't put the neighbor itself in his own candidates
1032-
if (node_level.links[j] == neighbour_id) {
1033-
continue;
1016+
if (node_level.links[j] != neighbour_id) {
1017+
candidate_ids.push_back(node_level.links[j]);
10341018
}
1035-
candidates.emplace_back(
1036-
this->distFunc(getDataByInternalId(node_level.links[j]), neighbours_data, this->dim),
1037-
node_level.links[j]);
10381019
}
1039-
10401020
// add the deleted element's neighbour's original neighbors in the candidates.
10411021
vecsim_stl::vector<bool> neighbour_orig_neighbours_set(curElementCount, false, this->allocator);
1042-
10431022
for (size_t j = 0; j < neighbor_level.numLinks; j++) {
10441023
neighbour_orig_neighbours_set[neighbor_level.links[j]] = true;
10451024
// Don't add the removed element to the candidates, nor nodes that are already in the
10461025
// candidates set.
1047-
if (neighbours_bitmap[neighbor_level.links[j]] ||
1048-
neighbor_level.links[j] == element_internal_id) {
1049-
continue;
1026+
if (neighbor_level.links[j] != element_internal_id &&
1027+
!neighbours_bitmap[neighbor_level.links[j]]) {
1028+
candidate_ids.push_back(neighbor_level.links[j]);
10501029
}
1051-
candidates.emplace_back(this->distFunc(neighbours_data,
1052-
getDataByInternalId(neighbor_level.links[j]),
1053-
this->dim),
1054-
neighbor_level.links[j]);
10551030
}
10561031

10571032
size_t Mcurmax = level ? M : M0;
1058-
size_t removed_links_num;
1059-
idType removed_links[neighbor_level.numLinks];
1060-
removeExtraLinks(candidates, Mcurmax, neighbor_level, neighbour_orig_neighbours_set,
1061-
removed_links, &removed_links_num);
1062-
1063-
// remove neighbour id from the incoming list of nodes for his
1064-
// neighbours that were chosen to remove
1065-
for (size_t i = 0; i < removed_links_num; i++) {
1066-
idType node_id = removed_links[i];
1067-
LevelData &node_level = getLevelData(node_id, level);
1033+
if (candidate_ids.size() > Mcurmax) {
1034+
// We need to filter the candidates by the heuristic.
1035+
candidatesList<DistType> candidates(this->allocator);
1036+
candidates.reserve(candidate_ids.size());
1037+
auto neighbours_data = getDataByInternalId(neighbour_id);
1038+
for (auto candidate_id : candidate_ids) {
1039+
candidates.emplace_back(
1040+
this->distFunc(getDataByInternalId(candidate_id), neighbours_data, this->dim),
1041+
candidate_id);
1042+
}
10681043

1069-
// if the node id (the neighbour's neighbour to be removed)
1070-
// wasn't pointing to the neighbour (edge was one directional),
1071-
// we should remove it from the node's incoming edges.
1072-
// otherwise, edge turned from bidirectional to one directional,
1073-
// and it should be saved in the neighbor's incoming edges.
1074-
if (!removeIdFromList(*node_level.incomingEdges, neighbour_id)) {
1075-
neighbor_level.incomingEdges->push_back(node_id);
1044+
candidate_ids.clear();
1045+
auto &not_chosen_candidates = candidate_ids; // rename and reuse the vector
1046+
getNeighborsByHeuristic2(candidates, Mcurmax, not_chosen_candidates);
1047+
1048+
neighbor_level.setLinks(candidates);
1049+
1050+
// remove neighbour id from the incoming list of nodes for his
1051+
// neighbours that were chosen to remove
1052+
for (auto node_id : not_chosen_candidates) {
1053+
if (neighbour_orig_neighbours_set[node_id]) {
1054+
// if the node id (the neighbour's neighbour to be removed)
1055+
// wasn't pointing to the neighbour (edge was one directional),
1056+
// we should remove it from the node's incoming edges.
1057+
// otherwise, edge turned from bidirectional to one directional,
1058+
// and it should be saved in the neighbor's incoming edges.
1059+
if (!removeIdFromList(*getLevelData(node_id, level).incomingEdges, neighbour_id)) {
1060+
neighbor_level.incomingEdges->push_back(node_id);
1061+
}
1062+
}
10761063
}
1064+
} else {
1065+
// We don't need to filter the candidates - just update the edges.
1066+
neighbor_level.setLinks(candidate_ids);
10771067
}
10781068

10791069
// updates for the new edges created
@@ -1499,7 +1489,7 @@ void HNSWIndex<DataType, DistType>::mutuallyUpdateForRepairedNode(
14991489
template <typename DataType, typename DistType>
15001490
void HNSWIndex<DataType, DistType>::repairNodeConnections(idType node_id, size_t level) {
15011491

1502-
candidatesList<DistType> neighbors_candidates(this->allocator);
1492+
vecsim_stl::vector<idType> neighbors_candidate_ids(this->allocator);
15031493
// Use bitmaps for fast accesses:
15041494
// node_orig_neighbours_set is used to differentiate between the neighbors that will *not* be
15051495
// selected by the heuristics - only the ones that were originally neighbors should be removed.
@@ -1512,7 +1502,6 @@ void HNSWIndex<DataType, DistType>::repairNodeConnections(idType node_id, size_t
15121502

15131503
// Go over the repaired node neighbors, collect the non-deleted ones to be neighbors candidates
15141504
// after the repair as well.
1515-
const void *node_data = getDataByInternalId(node_id);
15161505
auto *element = getGraphDataByInternalId(node_id);
15171506
lockNodeLinks(element);
15181507
LevelData &node_level_data = getLevelData(element, level);
@@ -1524,9 +1513,7 @@ void HNSWIndex<DataType, DistType>::repairNodeConnections(idType node_id, size_t
15241513
continue;
15251514
}
15261515
neighbors_candidates_set[node_level_data.links[j]] = true;
1527-
neighbors_candidates.emplace_back(
1528-
this->distFunc(node_data, getDataByInternalId(node_level_data.links[j]), this->dim),
1529-
node_level_data.links[j]);
1516+
neighbors_candidate_ids.push_back(node_level_data.links[j]);
15301517
}
15311518
unlockNodeLinks(element);
15321519

@@ -1561,28 +1548,43 @@ void HNSWIndex<DataType, DistType>::repairNodeConnections(idType node_id, size_t
15611548
continue;
15621549
}
15631550
neighbors_candidates_set[neighbor_level_data.links[j]] = true;
1564-
neighbors_candidates.emplace_back(
1565-
this->distFunc(node_data, getDataByInternalId(neighbor_level_data.links[j]),
1566-
this->dim),
1567-
neighbor_level_data.links[j]);
1551+
neighbors_candidate_ids.push_back(neighbor_level_data.links[j]);
15681552
}
15691553
unlockNodeLinks(neighbor);
15701554
}
15711555

15721556
size_t max_M_cur = level ? M : M0;
1573-
vecsim_stl::vector<idType> not_chosen_neighbors(this->allocator);
1574-
getNeighborsByHeuristic2(neighbors_candidates, max_M_cur, not_chosen_neighbors);
1557+
if (neighbors_candidate_ids.size() > max_M_cur) {
1558+
// We have more candidates than the maximum number of neighbors, so we need to select which
1559+
// ones to keep. We use the heuristic to select the neighbors, and then remove the ones that
1560+
// were not originally neighbors.
1561+
candidatesList<DistType> neighbors_candidates(this->allocator);
1562+
neighbors_candidates.reserve(neighbors_candidate_ids.size());
1563+
const void *node_data = getDataByInternalId(node_id);
1564+
for (idType candidate : neighbors_candidate_ids) {
1565+
neighbors_candidates.emplace_back(
1566+
this->distFunc(getDataByInternalId(candidate), node_data, this->dim), candidate);
1567+
}
1568+
vecsim_stl::vector<idType> not_chosen_neighbors(this->allocator);
1569+
getNeighborsByHeuristic2(neighbors_candidates, max_M_cur, not_chosen_neighbors);
15751570

1576-
for (idType not_chosen_neighbor : not_chosen_neighbors) {
1577-
if (node_orig_neighbours_set[not_chosen_neighbor]) {
1578-
neighbors_to_remove.push_back(not_chosen_neighbor);
1579-
nodes_to_update.push_back(not_chosen_neighbor);
1571+
for (idType not_chosen_neighbor : not_chosen_neighbors) {
1572+
if (node_orig_neighbours_set[not_chosen_neighbor]) {
1573+
neighbors_to_remove.push_back(not_chosen_neighbor);
1574+
nodes_to_update.push_back(not_chosen_neighbor);
1575+
}
15801576
}
1581-
}
15821577

1583-
for (auto &neighbor : neighbors_candidates) {
1584-
chosen_neighbors.push_back(neighbor.second);
1585-
nodes_to_update.push_back(neighbor.second);
1578+
for (auto &neighbor : neighbors_candidates) {
1579+
chosen_neighbors.push_back(neighbor.second);
1580+
nodes_to_update.push_back(neighbor.second);
1581+
}
1582+
} else {
1583+
// We have less candidates than the maximum number of neighbors, so we choose them all, and
1584+
// extend the nodes to update with them.
1585+
chosen_neighbors.swap(neighbors_candidate_ids);
1586+
nodes_to_update.insert(nodes_to_update.end(), chosen_neighbors.begin(),
1587+
chosen_neighbors.end());
15861588
}
15871589

15881590
// Perform the actual updates for the node and the impacted neighbors while holding the nodes'

src/VecSim/utils/vec_utils.h

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,6 @@
1313
#include <cassert>
1414
#include <cmath> //sqrt
1515

16-
template <typename dist_t>
17-
struct CompareByFirst {
18-
constexpr bool operator()(std::pair<dist_t, unsigned int> const &a,
19-
std::pair<dist_t, unsigned int> const &b) const noexcept {
20-
return (a.first != b.first) ? a.first < b.first : a.second < b.second;
21-
}
22-
};
23-
2416
struct VecSimCommonStrings {
2517
public:
2618
static const char *ALGORITHM_STRING;

0 commit comments

Comments
 (0)