Skip to content

Commit d784136

Browse files
authored
Deletion multi bug fix (#346)
1 parent 01748ae commit d784136

File tree

6 files changed

+116
-2
lines changed

6 files changed

+116
-2
lines changed

src/VecSim/algorithms/brute_force/brute_force_multi.h

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,8 +127,18 @@ template <typename DataType, typename DistType>
127127
void BruteForceIndex_Multi<DataType, DistType>::replaceIdOfLabel(labelType label, idType new_id,
128128
idType old_id) {
129129
assert(labelToIdsLookup.find(label) != labelToIdsLookup.end());
130+
// *Non-trivial code here* - in every iteration we replace the internal id of the previous last
131+
// id that has been swapped with the deleted id. Note that if the old and the new replaced ids
132+
// both belong to the same label, then we are going to delete the new id later on as well, since
133+
// we are currently iterating on this exact array of ids in 'deleteVector'. Hence, the relevant
134+
// part of the vector that should be updated is the "tail" that comes after the position of
135+
// old_id, while the "head" may contain old occurrences of old_id that are irrelevant for the
136+
// future deletions. Therefore, we iterate from end to beginning. For example, assuming we are
137+
// deleting a label that contains the only 3 ids that exist in the index. Hence, we would
138+
// expect the following scenario w.r.t. the ids array:
139+
// [|1, 0, 2] -> [1, |0, 1] -> [1, 0, |0] (where | marks the current position)
130140
auto &ids = labelToIdsLookup.at(label);
131-
for (size_t i = 0; i < ids.size(); i++) {
141+
for (int i = ids.size() - 1; i >= 0; i--) {
132142
if (ids[i] == old_id) {
133143
ids[i] = new_id;
134144
return;

src/VecSim/algorithms/brute_force/brute_force_multi_tests_friends.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,4 @@ INDEX_TEST_FRIEND_CLASS(BruteForceMultiTest_indexing_same_vector_Test)
1414
INDEX_TEST_FRIEND_CLASS(BruteForceMultiTest_test_delete_swap_block_Test)
1515
INDEX_TEST_FRIEND_CLASS(BruteForceMultiTest_test_dynamic_bf_info_iterator_Test)
1616
INDEX_TEST_FRIEND_CLASS(BruteForceMultiTest_remove_vector_after_replacing_block_Test)
17+
INDEX_TEST_FRIEND_CLASS(BruteForceMultiTest_removeVectorWithSwaps_Test)

src/VecSim/algorithms/hnsw/hnsw_multi.h

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,8 +109,18 @@ template <typename DataType, typename DistType>
109109
void HNSWIndex_Multi<DataType, DistType>::replaceIdOfLabel(labelType label, idType new_id,
110110
idType old_id) {
111111
assert(label_lookup_.find(label) != label_lookup_.end());
112+
// *Non-trivial code here* - in every iteration we replace the internal id of the previous last
113+
// id that has been swapped with the deleted id. Note that if the old and the new replaced ids
114+
// both belong to the same label, then we are going to delete the new id later on as well, since
115+
// we are currently iterating on this exact array of ids in 'deleteVector'. Hence, the relevant
116+
// part of the vector that should be updated is the "tail" that comes after the position of
117+
// old_id, while the "head" may contain old occurrences of old_id that are irrelevant for the
118+
// future deletions. Therefore, we iterate from end to beginning. For example, assuming we are
119+
// deleting a label that contains the only 3 ids that exist in the index. Hence, we would
120+
// expect the following scenario w.r.t. the ids array:
121+
// [|1, 0, 2] -> [1, |0, 1] -> [1, 0, |0] (where | marks the current position)
112122
auto &ids = label_lookup_.at(label);
113-
for (size_t i = 0; i < ids.size(); i++) {
123+
for (int i = ids.size() - 1; i >= 0; i--) {
114124
if (ids[i] == old_id) {
115125
ids[i] = new_id;
116126
return;

src/VecSim/algorithms/hnsw/hnsw_multi_tests_friends.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,4 @@ INDEX_TEST_FRIEND_CLASS(HNSWMultiTest_indexing_same_vector_Test)
1111
INDEX_TEST_FRIEND_CLASS(HNSWMultiTest_test_dynamic_hnsw_info_iterator_Test)
1212
INDEX_TEST_FRIEND_CLASS(HNSWMultiTest_preferAdHocOptimization_Test)
1313
INDEX_TEST_FRIEND_CLASS(HNSWMultiTest_testSizeEstimation_Test)
14+
INDEX_TEST_FRIEND_CLASS(HNSWMultiTest_removeVectorWithSwaps_Test)

tests/unit/test_bruteforce_multi.cpp

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -710,6 +710,52 @@ TYPED_TEST(BruteForceMultiTest, search_empty_index) {
710710
VecSimIndex_Free(index);
711711
}
712712

713+
TYPED_TEST(BruteForceMultiTest, removeVectorWithSwaps) {
714+
size_t dim = 4;
715+
size_t n = 6;
716+
717+
BFParams params = {.dim = dim, .metric = VecSimMetric_L2};
718+
auto *index = this->CastToBF_Multi(this->CreateNewIndex(params));
719+
720+
// Insert 3 vectors under two different labels, so that we will have:
721+
// {first_label->[0,1,3], second_label->[2,4,5]}
722+
labelType first_label = 1;
723+
labelType second_label = 2;
724+
725+
GenerateAndAddVector<TEST_DATA_T>(index, dim, first_label);
726+
GenerateAndAddVector<TEST_DATA_T>(index, dim, first_label);
727+
GenerateAndAddVector<TEST_DATA_T>(index, dim, second_label);
728+
GenerateAndAddVector<TEST_DATA_T>(index, dim, first_label);
729+
GenerateAndAddVector<TEST_DATA_T>(index, dim, second_label);
730+
GenerateAndAddVector<TEST_DATA_T>(index, dim, second_label);
731+
ASSERT_EQ(VecSimIndex_IndexSize(index), n);
732+
733+
// Artificially reorder the internal ids to test that we make the right changes
734+
// when we have an id that appears twice in the array upon deleting the ids one by one.
735+
ASSERT_EQ(index->labelToIdsLookup.at(second_label).size(), n / 2);
736+
index->labelToIdsLookup.at(second_label)[0] = 4;
737+
index->labelToIdsLookup.at(second_label)[1] = 2;
738+
index->labelToIdsLookup.at(second_label)[2] = 5;
739+
740+
// Expect that the ids array of the second label will behave as following:
741+
// [|4, 2, 5] -> [4, |2, 4] -> [4, 2, |2] (where | marks the current position).
742+
index->deleteVector(second_label);
743+
ASSERT_EQ(index->indexLabelCount(), 1);
744+
ASSERT_EQ(VecSimIndex_IndexSize(index), n / 2);
745+
746+
// Check that the internal ids of the first label are as expected.
747+
auto ids = index->labelToIdsLookup.at(first_label);
748+
ASSERT_EQ(ids.size(), n / 2);
749+
ASSERT_TRUE(std::find(ids.begin(), ids.end(), 0) != ids.end());
750+
ASSERT_TRUE(std::find(ids.begin(), ids.end(), 1) != ids.end());
751+
ASSERT_TRUE(std::find(ids.begin(), ids.end(), 2) != ids.end());
752+
index->deleteVector(first_label);
753+
ASSERT_EQ(index->indexLabelCount(), 0);
754+
ASSERT_EQ(VecSimIndex_IndexSize(index), 0);
755+
756+
VecSimIndex_Free(index);
757+
}
758+
713759
TYPED_TEST(BruteForceMultiTest, remove_vector_after_replacing_block) {
714760
size_t dim = 4;
715761
size_t bs = 2;

tests/unit/test_hnsw_multi.cpp

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -682,6 +682,52 @@ TYPED_TEST(HNSWMultiTest, search_empty_index) {
682682
VecSimIndex_Free(index);
683683
}
684684

685+
TYPED_TEST(HNSWMultiTest, removeVectorWithSwaps) {
686+
size_t dim = 4;
687+
size_t n = 6;
688+
689+
HNSWParams params = {.dim = dim, .metric = VecSimMetric_L2};
690+
auto *index = this->CastToHNSW_Multi(this->CreateNewIndex(params));
691+
692+
// Insert 3 vectors under two different labels, so that we will have:
693+
// {first_label->[0,1,3], second_label->[2,4,5]}
694+
labelType first_label = 1;
695+
labelType second_label = 2;
696+
697+
GenerateAndAddVector<TEST_DATA_T>(index, dim, first_label);
698+
GenerateAndAddVector<TEST_DATA_T>(index, dim, first_label);
699+
GenerateAndAddVector<TEST_DATA_T>(index, dim, second_label);
700+
GenerateAndAddVector<TEST_DATA_T>(index, dim, first_label);
701+
GenerateAndAddVector<TEST_DATA_T>(index, dim, second_label);
702+
GenerateAndAddVector<TEST_DATA_T>(index, dim, second_label);
703+
ASSERT_EQ(VecSimIndex_IndexSize(index), n);
704+
705+
// Artificially reorder the internal ids to test that we make the right changes
706+
// when we have an id that appears twice in the array upon deleting the ids one by one.
707+
ASSERT_EQ(index->label_lookup_.at(second_label).size(), n / 2);
708+
index->label_lookup_.at(second_label)[0] = 4;
709+
index->label_lookup_.at(second_label)[1] = 2;
710+
index->label_lookup_.at(second_label)[2] = 5;
711+
712+
// Expect that the ids array of the second label will behave as following:
713+
// [|4, 2, 5] -> [4, |2, 4] -> [4, 2, |2] (where | marks the current position).
714+
index->deleteVector(second_label);
715+
ASSERT_EQ(index->indexLabelCount(), 1);
716+
ASSERT_EQ(VecSimIndex_IndexSize(index), n / 2);
717+
718+
// Check that the internal ids of the first label are as expected.
719+
auto ids = index->label_lookup_.at(first_label);
720+
ASSERT_EQ(ids.size(), n / 2);
721+
ASSERT_TRUE(std::find(ids.begin(), ids.end(), 0) != ids.end());
722+
ASSERT_TRUE(std::find(ids.begin(), ids.end(), 1) != ids.end());
723+
ASSERT_TRUE(std::find(ids.begin(), ids.end(), 2) != ids.end());
724+
index->deleteVector(first_label);
725+
ASSERT_EQ(index->indexLabelCount(), 0);
726+
ASSERT_EQ(VecSimIndex_IndexSize(index), 0);
727+
728+
VecSimIndex_Free(index);
729+
}
730+
685731
TYPED_TEST(HNSWMultiTest, remove_vector_after_replacing_block) {
686732
size_t dim = 4;
687733
size_t bs = 2;

0 commit comments

Comments
 (0)