-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathembeddings_manager.py
More file actions
67 lines (59 loc) · 2.6 KB
/
embeddings_manager.py
File metadata and controls
67 lines (59 loc) · 2.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#!/usr/bin/env python3
"""
Embeddings and semantic search manager using ChromaDB and sentence-transformers.
Persists embeddings to a local directory and provides batch upsert/query.
"""
from __future__ import annotations
from typing import Iterable, List, Dict, Optional, Tuple
import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions
from config import CrawlerConfig
class EmbeddingsManager:
    """Store page embeddings in a persistent ChromaDB collection and query them.

    Documents are embedded with the sentence-transformers model named in the
    crawler configuration; the collection is persisted under the configured
    directory.
    """

    def __init__(self, config: CrawlerConfig) -> None:
        """Open the persistent client and get or create the collection.

        Args:
            config: Crawler settings. ``CHROMA_PERSIST_DIR``,
                ``EMBEDDINGS_MODEL`` and ``CHROMA_COLLECTION`` are read.
        """
        self.config = config
        self.client = chromadb.PersistentClient(
            path=str(config.CHROMA_PERSIST_DIR),
            settings=Settings(allow_reset=False),
        )
        self.embed_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name=config.EMBEDDINGS_MODEL
        )
        # NOTE(review): no distance space is passed here, so the collection
        # uses whatever metric Chroma defaults to -- see query_similar().
        self.collection = self.client.get_or_create_collection(
            name=config.CHROMA_COLLECTION, embedding_function=self.embed_fn
        )

    def upsert_pages(self, items: Iterable[Tuple[str, str, Dict]]) -> None:
        """Upsert several pages into the vector store with a single call.

        Args:
            items: Iterable of ``(doc_id, content, metadata)`` triples. It is
                consumed once, so a generator is fine.

        Nothing is sent to the collection when ``items`` is empty.
        """
        ids: List[str] = []
        docs: List[str] = []
        metas: List[Dict] = []
        for doc_id, content, metadata in items:
            ids.append(doc_id)
            docs.append(content)
            metas.append(metadata)
        if ids:
            self.collection.upsert(ids=ids, documents=docs, metadatas=metas)

    def upsert_page(self, doc_id: str, content: str, metadata: Optional[Dict] = None) -> None:
        """Upsert a single page.

        Args:
            doc_id: Identifier of the document in the collection.
            content: Text to embed and store.
            metadata: Optional metadata; a missing or empty value is sent as
                an empty dict.
        """
        # NOTE(review): some Chroma releases may reject an empty metadata
        # dict -- confirm against the pinned chromadb version.
        self.collection.upsert(ids=[doc_id], documents=[content], metadatas=[metadata or {}])

    @staticmethod
    def _first_row(res: Dict, key: str) -> List:
        """Return the result column ``key`` for the first (only) query text.

        A missing key, a ``None`` value or an empty outer list all yield
        ``[]`` so callers never index into ``None``.
        """
        rows = res.get(key)
        if rows is None or len(rows) == 0:
            return []
        first = rows[0]
        return [] if first is None else list(first)

    def query_similar(
        self,
        content: str,
        top_k: int,
        threshold: float = 0.75,
        exclude_ids: Optional[List[str]] = None,
    ) -> List[Dict]:
        """Return up to ``top_k`` stored documents similar to ``content``.

        Args:
            content: Query text; it is embedded by the collection.
            top_k: Maximum number of hits to return. Values ``<= 0`` return
                an empty list without querying the collection.
            threshold: Minimum score (``1 - distance``) a hit must reach.
            exclude_ids: Document ids to drop from the results, e.g. the id
                of the page being compared.

        Returns:
            A list of ``{"id", "score", "preview"}`` dicts in the order the
            collection returned them. ``preview`` is the stored document
            text, or ``None`` when the collection returned no documents.
            When no distance is available for a hit its score is ``0.0``.

        NOTE(review): ``1 - distance`` is a cosine similarity only if the
        collection uses the cosine space; the collection is created without
        an explicit space, so confirm which metric is in effect before
        relying on ``threshold``.
        """
        if top_k <= 0:
            return []

        excluded = set(exclude_ids or ())
        # Over-fetch by the number of excluded ids so that filtering them out
        # can still leave top_k candidates.
        res = self.collection.query(
            query_texts=[content],
            n_results=top_k + len(excluded),
        )

        ids = self._first_row(res, "ids")
        docs = self._first_row(res, "documents")
        # Only real distances are usable for scoring; embeddings are vectors
        # and must never be substituted for them.
        dists = self._first_row(res, "distances")

        results: List[Dict] = []
        for idx, doc_id in enumerate(ids):
            if doc_id in excluded:
                continue
            score = (1.0 - float(dists[idx])) if idx < len(dists) else 0.0
            if score < threshold:
                continue
            preview = docs[idx] if idx < len(docs) else None
            results.append({"id": doc_id, "score": score, "preview": preview})
            if len(results) >= top_k:
                break
        return results