11"""
22Optimized Code Indexer
33High-performance indexing with batch embeddings and parallel processing
4+
5+ Improvements (v2):
6+ - Embedding model is configurable via EMBEDDING_MODEL (defaults to text-embedding-3-small; text-embedding-3-large is supported)
7+ - Rich embedding text with docstrings, params, and context
8+ - Query expansion for better recall
9+ - Keyword boosting for exact matches
410"""
511import os
612from pathlib import Path
2228from dotenv import load_dotenv
2329import time
2430
31+ # Search enhancement
32+ from services .search_enhancer import SearchEnhancer
33+
2534load_dotenv ()
2635
36+ # Configuration
37+ # Note: when reusing an existing Pinecone index, pick a model whose embedding dimension matches the index (1536 for "small", 3072 for "large")
38+ EMBEDDING_MODEL = os .getenv ("EMBEDDING_MODEL" , "text-embedding-3-small" )
39+ EMBEDDING_DIMENSIONS = 3072 if "large" in EMBEDDING_MODEL else 1536
40+
2741
2842class OptimizedCodeIndexer :
2943 """Index and search code using semantic embeddings - OPTIMIZED"""
@@ -37,17 +51,25 @@ def __init__(self):
3751 # Initialize OpenAI
3852 self .openai_client = AsyncOpenAI (api_key = os .getenv ("OPENAI_API_KEY" ))
3953
54+ # Initialize search enhancer
55+ self .search_enhancer = SearchEnhancer (self .openai_client )
56+
4057 # Initialize Pinecone
4158 pc = Pinecone (api_key = os .getenv ("PINECONE_API_KEY" ))
4259
4360 index_name = os .getenv ("PINECONE_INDEX_NAME" , "codeintel" )
4461
45- # Create index if it doesn't exist
46- if index_name not in pc .list_indexes ().names ():
47- print (f"Creating Pinecone index: { index_name } " )
62+ # Reuse the index if it already exists (its dimension is only reported, not validated against EMBEDDING_DIMENSIONS); otherwise create it
63+ existing_indexes = pc .list_indexes ().names ()
64+ if index_name in existing_indexes :
65+ # Use existing index (dimension already set)
66+ index_info = pc .describe_index (index_name )
67+ print (f"📊 Using existing Pinecone index: { index_name } (dim={ index_info .dimension } )" )
68+ else :
69+ print (f"Creating Pinecone index: { index_name } with dimension { EMBEDDING_DIMENSIONS } " )
4870 pc .create_index (
4971 name = index_name ,
50- dimension = 1536 , # OpenAI embedding dimension
72+ dimension = EMBEDDING_DIMENSIONS ,
5173 metric = "cosine" ,
5274 spec = ServerlessSpec (
5375 cloud = "aws" ,
@@ -64,7 +86,7 @@ def __init__(self):
6486 'typescript' : self ._create_parser (Language (tsjavascript .language ())),
6587 }
6688
67- print ("✅ OptimizedCodeIndexer initialized!" )
89+ print (f "✅ OptimizedCodeIndexer initialized! (model: { EMBEDDING_MODEL } ) " )
6890
6991 def _create_parser (self , language ) -> Parser :
7092 """Create a tree-sitter parser"""
@@ -110,16 +132,16 @@ def _discover_code_files(self, repo_path: str) -> List[Path]:
110132 return code_files
111133
112134 async def _create_embeddings_batch (self , texts : List [str ]) -> List [List [float ]]:
113- """Generate embeddings in batch - MUCH FASTER """
135+ """Generate embeddings in batch using configured model """
114136 if not texts :
115137 return []
116138
117139 try :
118- # Truncate texts if too long
140+ # Truncate each text to 8000 characters - a rough character-based guard for the 8191-token input limit, not an exact token count
119141 truncated_texts = [text [:8000 ] for text in texts ]
120142
121143 response = await self .openai_client .embeddings .create (
122- model = "text-embedding-3-small" ,
144+ model = EMBEDDING_MODEL ,
123145 input = truncated_texts
124146 )
125147
@@ -129,7 +151,7 @@ async def _create_embeddings_batch(self, texts: List[str]) -> List[List[float]]:
129151 except Exception as e :
130152 print (f"❌ Error creating batch embeddings: { e } " )
131153 # Return zero vectors on error
132- return [[0.0 ] * 1536 for _ in texts ]
154+ return [[0.0 ] * EMBEDDING_DIMENSIONS for _ in texts ]
133155
134156 def _extract_functions (self , tree_node , source_code : bytes ) -> List [Dict ]:
135157 """Extract function/class definitions from AST"""
@@ -214,9 +236,11 @@ async def index_repository(self, repo_id: str, repo_path: str):
214236
215237 # Generate embeddings in BATCHES (this is the key optimization)
216238 print (f"\n 🧠 Generating embeddings in batches of { self .EMBEDDING_BATCH_SIZE } ..." )
239+ print (f" Using model: { EMBEDDING_MODEL } " )
217240
241+ # Create rich embedding texts using search enhancer
218242 embedding_texts = [
219- f"Function: { func [ 'name' ] } \n Type: { func [ 'type' ] } \n \n { func [ 'code' ][: 1000 ] } "
243+ self . search_enhancer . create_rich_embedding_text ( func )
220244 for func in all_functions_data
221245 ]
222246
@@ -304,23 +328,41 @@ async def semantic_search(
304328 self ,
305329 query : str ,
306330 repo_id : str ,
307- max_results : int = 10
331+ max_results : int = 10 ,
332+ use_query_expansion : bool = True ,
333+ use_reranking : bool = True
308334 ) -> List [Dict ]:
309- """Search code using semantic similarity"""
335+ """
336+ Search code using semantic similarity with enhancements.
337+
338+ Args:
339+ query: Search query
340+ repo_id: Repository to search in
341+ max_results: Number of results to return
342+ use_query_expansion: Expand query with related terms
343+ use_reranking: Rerank results with keyword boosting
344+ """
310345 try :
311- # Generate query embedding (single request)
312- query_embeddings = await self ._create_embeddings_batch ([query ])
346+ # Step 1: Query expansion (adds related programming terms)
347+ search_query = query
348+ if use_query_expansion :
349+ search_query = await self .search_enhancer .expand_query (query )
350+ print (f"🔍 Expanded query: { search_query [:100 ]} ..." )
351+
352+ # Step 2: Generate query embedding
353+ query_embeddings = await self ._create_embeddings_batch ([search_query ])
313354 query_embedding = query_embeddings [0 ]
314355
315- # Search Pinecone
356+ # Step 3: Search Pinecone (retrieve more for reranking)
357+ retrieve_count = max_results * 3 if use_reranking else max_results
316358 results = self .index .query (
317359 vector = query_embedding ,
318360 filter = {"repo_id" : {"$eq" : repo_id }},
319- top_k = max_results ,
361+ top_k = retrieve_count ,
320362 include_metadata = True
321363 )
322364
323- # Format results
365+ # Step 4: Format results
324366 formatted_results = []
325367 for match in results .matches :
326368 formatted_results .append ({
@@ -334,7 +376,14 @@ async def semantic_search(
334376 "line_end" : match .metadata .get ("end_line" , 0 ),
335377 })
336378
337- return formatted_results
379+ # Step 5: Rerank with keyword boosting
380+ if use_reranking and formatted_results :
381+ formatted_results = self .search_enhancer .rerank_results (
382+ query , # Use original query for keyword matching
383+ formatted_results
384+ )
385+
386+ return formatted_results [:max_results ]
338387
339388 except Exception as e :
340389 print (f"❌ Error searching: { e } " )
@@ -441,8 +490,9 @@ async def index_repository_with_progress(
441490 # Generate embeddings in BATCHES
442491 print (f"\n 🧠 Generating embeddings in batches of { self .EMBEDDING_BATCH_SIZE } ..." )
443492
493+ # Create rich embedding texts using search enhancer
444494 embedding_texts = [
445- f"Function: { func [ 'name' ] } \n Type: { func [ 'type' ] } \n \n { func [ 'code' ][: 1000 ] } "
495+ self . search_enhancer . create_rich_embedding_text ( func )
446496 for func in all_functions_data
447497 ]
448498
@@ -553,8 +603,9 @@ async def incremental_index_repository(
553603 # Generate embeddings in batches
554604 print (f"\n 🧠 Generating embeddings for { len (all_functions_data )} functions..." )
555605
606+ # Create rich embedding texts using search enhancer
556607 embedding_texts = [
557- f"Function: { func [ 'name' ] } \n Type: { func [ 'type' ] } \n \n { func [ 'code' ][: 1000 ] } "
608+ self . search_enhancer . create_rich_embedding_text ( func )
558609 for func in all_functions_data
559610 ]
560611
@@ -565,7 +616,6 @@ async def incremental_index_repository(
565616 all_embeddings .extend (batch_embeddings )
566617
567618 # Prepare vectors
568- import hashlib
569619 vectors_to_upsert = []
570620
571621 for func_data , embedding in zip (all_functions_data , all_embeddings ):
0 commit comments