-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path4_retrieve_docs.py
More file actions
61 lines (47 loc) · 1.7 KB
/
4_retrieve_docs.py
File metadata and controls
61 lines (47 loc) · 1.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import os
import sqlite3
import json
import numpy as np
from openai import OpenAI
from dotenv import load_dotenv
from typing import List, Dict, Tuple
from sklearn.metrics.pairwise import cosine_similarity
# Pull settings from a local .env file (if one exists) into the process
# environment before the API key is read below.
load_dotenv()

# Module-wide OpenAI client shared by every embedding request in this file.
# The key comes from the OPENAI_API_KEY environment variable; if it is unset,
# None is passed through to the client constructor.
openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
def generate_embeddings(texts: List[str]) -> List[List[float]]:
    """Embed each string in ``texts`` via the OpenAI embeddings endpoint.

    The request uses the ``text-embedding-3-small`` model and asks for
    256-dimensional vectors.

    Args:
        texts: Strings to embed. May be empty.

    Returns:
        One embedding per item of the API response, in the order the
        response lists them (presumably the input order -- confirm against
        the API reference). An empty ``texts`` yields an empty list.
    """
    if not texts:
        # The embeddings endpoint rejects an empty input list, so answer
        # locally instead of paying for a request that can only fail.
        return []

    response = openai_client.embeddings.create(
        model="text-embedding-3-small",
        dimensions=256,
        input=texts,
    )
    return [item.embedding for item in response.data]
def cosine_distance(a: List[float], b: List[float]) -> float:
    """Return the cosine distance ``1 - cos(a, b)`` between two vectors.

    Args:
        a: First vector (a flat sequence of numbers).
        b: Second vector, same length as ``a``.

    Returns:
        A builtin ``float`` in ``[0.0, 2.0]``: 0.0 for vectors pointing the
        same way, 1.0 for orthogonal vectors, 2.0 for opposite vectors.
        If either vector has zero length the similarity is treated as 0,
        so the distance is 1.0.

    Raises:
        ValueError: If the inputs are not non-empty one-dimensional vectors
            of equal length, or contain NaN/infinite values.
    """
    vec_a = np.asarray(a, dtype=float)
    vec_b = np.asarray(b, dtype=float)

    if vec_a.ndim != 1 or vec_a.shape != vec_b.shape or vec_a.size == 0:
        raise ValueError(
            "cosine_distance expects two non-empty 1-D vectors of equal length"
        )
    if not (np.isfinite(vec_a).all() and np.isfinite(vec_b).all()):
        raise ValueError("cosine_distance inputs must be finite numbers")

    norm_product = float(np.linalg.norm(vec_a) * np.linalg.norm(vec_b))
    if norm_product == 0.0:
        # A zero vector has no direction; treat its similarity as 0.
        return 1.0

    similarity = float(np.dot(vec_a, vec_b)) / norm_product
    # Rounding can push the ratio a hair outside [-1, 1]; clamp it so the
    # distance never comes out negative or above 2.
    return 1.0 - float(np.clip(similarity, -1.0, 1.0))
def retrieve_documents(query: str, limit: int = 3) -> List[Dict]:
    """Return the stored documents most similar to ``query``.

    The query is embedded with ``generate_embeddings``, every row of the
    ``documents`` table in ``documents.db`` (path relative to the current
    working directory) is loaded, and each row's JSON-encoded embedding is
    scored against the query by cosine similarity.

    Args:
        query: Free-text search query.
        limit: Maximum number of documents to return. Values below zero are
            treated as zero.

    Returns:
        Up to ``limit`` dicts with keys ``'id'``, ``'name'``, ``'content'``
        and ``'similarity'`` (a builtin ``float``), ordered from most to
        least similar. Empty when the table has no rows.

    Side effects:
        Prints the result list to stdout before returning it.
    """
    query_embedding = generate_embeddings([query])[0]

    conn = sqlite3.connect('documents.db')
    try:
        # try/finally so the connection is released even when the query
        # fails (for example, when the table does not exist).
        cursor = conn.cursor()
        cursor.execute('SELECT id, name, content, embedding FROM documents')
        rows = cursor.fetchall()
    finally:
        conn.close()

    documents_with_similarity: List[Dict] = []
    if rows:
        # Score all documents in one batched call rather than one pairwise
        # call per row. The guard matters because the batched call needs at
        # least one document row.
        doc_embeddings = [json.loads(row[3]) for row in rows]
        scores = cosine_similarity([query_embedding], doc_embeddings)[0]
        documents_with_similarity = [
            {
                'id': doc_id,
                'name': name,
                'content': content,
                # Plain float keeps printed/serialized output free of
                # NumPy scalar wrappers.
                'similarity': float(score),
            }
            for (doc_id, name, content, _), score in zip(rows, scores)
        ]

    documents_with_similarity.sort(
        key=lambda doc: doc['similarity'], reverse=True
    )
    # Clamp so a negative limit cannot turn the slice into "drop from the end".
    results = documents_with_similarity[:max(limit, 0)]
    print(results)
    return results
if __name__ == "__main__":
    # Manual smoke test: run one sample query and report how many
    # documents came back.
    sample_query = "Tell me about rhinos"
    docs = retrieve_documents(sample_query)
    print(f"Retrieved {len(docs)} documents")