-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscale_test.py
More file actions
75 lines (65 loc) · 2.65 KB
/
scale_test.py
File metadata and controls
75 lines (65 loc) · 2.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import numpy as np
import time
from core.lattices import LeechLattice
from leech_hash import LeechHash
def _make_synthetic_embeddings(num_vectors, dim, num_concepts=10):
    """Build clustered synthetic embeddings from the global NumPy RNG.

    Draws ``num_concepts`` cluster centres (standard normal scaled by 5.0),
    then assigns vectors to centres round-robin and perturbs each one with
    Gaussian noise of standard deviation 0.1.

    Returns:
        ``(base_points, data, labels)`` where ``base_points`` is the
        ``(num_concepts, dim)`` array of centres, ``data`` is a list of
        ``num_vectors`` arrays of length ``dim``, and ``labels`` is the
        parallel list of ``"concept_<centre>_id_<i>"`` strings.
    """
    # Wide variance between centres so the clusters are clearly distinct.
    base_points = np.random.randn(num_concepts, dim) * 5.0
    data = []
    labels = []
    # Noise is drawn one vector at a time (not as a single matrix) so the
    # sequence of RNG calls matches the original script exactly.
    for i in range(num_vectors):
        base_idx = i % num_concepts
        data.append(base_points[base_idx] + np.random.normal(0, 0.1, dim))
        labels.append(f"concept_{base_idx}_id_{i}")
    return base_points, data, labels


def _find_central_key(table, exact_results):
    """Return the key of a bucket sharing an entry with *exact_results*.

    Returns ``None`` when *exact_results* is empty or unhashable, or when no
    bucket contains any of its entries.

    NOTE(review): this assumes ``lookup`` returns the same hashable entries
    that are stored in the table's buckets, and that they come from the
    query's own bucket. If the hash object exposes its quantizer directly,
    prefer calling that instead -- confirm against the LeechHash API.
    """
    try:
        wanted = set(exact_results)
    except TypeError:
        return None
    if not wanted:
        return None
    for key, bucket in table.items():
        if not wanted.isdisjoint(bucket):
            return key
    return None


def _collect_neighbors(table, central_key, shell_dist_sq=32.0, tol=1e-5):
    """Collect the entries of every bucket on a fixed-distance shell.

    A bucket qualifies when the squared Euclidean distance between its key
    and *central_key* equals *shell_dist_sq* within *tol*. The result is a
    de-duplicated list in arbitrary order; it is empty when the table is
    empty or *central_key* is ``None``.

    NOTE(review): 32.0 is presumably the squared length of the lattice's
    shortest vectors in the key coordinate system -- confirm.
    """
    if not table or central_key is None:
        return []
    # One vectorized distance computation over all keys instead of probing
    # each candidate neighbor key individually.
    table_keys = np.array(list(table.keys()))
    diffs = table_keys - np.asarray(central_key)
    dists_sq = np.sum(diffs ** 2, axis=1)
    # Small tolerance because the distances may be floating point.
    neighbor_keys = table_keys[np.abs(dists_sq - shell_dist_sq) < tol]
    found = set()
    for nk in neighbor_keys:
        found.update(table.get(tuple(nk.tolist()), []))
    return list(found)


def scale_test(num_vectors: int = 5000) -> None:
    """Benchmark bulk indexing and retrieval of a ``LeechHash`` index.

    Generates ``num_vectors`` clustered 24-dimensional embeddings (seeded, so
    runs are repeatable), indexes them, then times an exact lookup and a
    neighborhood lookup for one query near the first cluster centre, and
    prints a storage summary. All results are printed; nothing is returned.

    Args:
        num_vectors: Number of synthetic embeddings to index. Defaults to
            5000, the size the script actually ran with.
    """
    dim = 24
    # The banner is derived from the real workload size so it cannot drift
    # from what is actually run.
    print(f"--- Leech Lattice Scaling Test: {num_vectors:,} Embeddings ---")
    lh = LeechHash()

    # 1. Generate large synthetic dataset
    print(f"Generating {num_vectors} synthetic 24D embeddings...")
    np.random.seed(42)
    base_points, data, labels = _make_synthetic_embeddings(num_vectors, dim)

    # 2. Bulk Indexing
    print(f"Indexing {num_vectors} vectors...")
    start_time = time.perf_counter()
    for label, vec in zip(labels, data):
        lh.index(label, vec)
    index_time = time.perf_counter() - start_time
    # Guard against a zero interval on very small runs.
    rate = num_vectors / index_time if index_time > 0 else float("inf")
    print(f"Indexing complete in {index_time:.2f} seconds ({rate:.2f} vectors/sec)")

    # 3. Retrieval Performance
    print("\nTesting retrieval performance...")
    query_vec = base_points[0] + np.random.normal(0, 0.1, dim)

    # Exact lookup
    start_time = time.perf_counter()
    exact_results = lh.lookup(query_vec)
    exact_time = time.perf_counter() - start_time
    print(f"Exact lookup time: {exact_time*1000:.2f} ms")
    print(f"Exact matches found: {len(exact_results)}")

    # Neighborhood lookup (Recall boost)
    print("Neighborhood lookup (checking ~200k potential buckets)...")
    start_time = time.perf_counter()
    # The original code read an undefined `central_q` here; the central key
    # is now recovered from the table via the exact-lookup results.
    central_key = _find_central_key(lh.table, exact_results) if lh.table else None
    if lh.table and central_key is None:
        print("Neighborhood lookup skipped: query bucket not found in the index")
    neighborhood_results = _collect_neighbors(lh.table, central_key)
    neigh_time = time.perf_counter() - start_time
    print(f"Neighborhood lookup time: {neigh_time:.2f} seconds")
    print(f"Neighborhood matches found: {len(neighborhood_results)}")

    # 4. Storage efficiency check
    num_buckets = len(lh.table)
    print("\nStorage Summary:")
    print(f"Total Vectors: {num_vectors}")
    print(f"Unique Buckets: {num_buckets}")
    if num_buckets:
        print(f"Avg Vectors per Bucket: {num_vectors/num_buckets:.2f}")
    else:
        # An empty index has no meaningful average; avoid dividing by zero.
        print("Avg Vectors per Bucket: n/a (index is empty)")


if __name__ == "__main__":
    scale_test()