|
31 | 31 | # Search enhancement |
32 | 32 | from services.search_enhancer import SearchEnhancer |
33 | 33 |
|
| 34 | +# Search V2 - Function-level extraction (Issue #68) |
| 35 | +from services.search_v2 import TreeSitterExtractor, FunctionFilter, ExtractedFunction |
| 36 | + |
34 | 37 | # Observability |
35 | 38 | from services.observability import logger, trace_operation, track_time, capture_exception, add_breadcrumb, metrics |
36 | 39 |
|
@@ -89,6 +92,14 @@ def __init__(self): |
89 | 92 | 'typescript': self._create_parser(Language(tsjavascript.language())), |
90 | 93 | } |
91 | 94 |
|
| 95 | + # Search V2: Initialize advanced tree-sitter extractor and filter (Issue #68) |
| 96 | + self.tree_sitter_extractor = TreeSitterExtractor() |
| 97 | + self.function_filter = FunctionFilter( |
| 98 | + include_private=False, |
| 99 | + include_dunders=True, |
| 100 | + max_name_length=50 |
| 101 | + ) |
| 102 | + |
92 | 103 | logger.info("OptimizedCodeIndexer initialized", model=EMBEDDING_MODEL) |
93 | 104 |
|
94 | 105 | def _create_parser(self, language) -> Parser: |
@@ -340,6 +351,203 @@ async def _extract_functions_from_file( |
340 | 351 | logger.error("Error processing file", file_path=file_path, error=str(e)) |
341 | 352 | return [] |
342 | 353 |
|
def extract_functions_v2(
    self,
    repo_path: str,
    max_functions: int = 5000
) -> List[ExtractedFunction]:
    """
    Run the Search V2 extraction pipeline over a repository (Issue #68).

    Two stages run back to back:

    1. ``self.tree_sitter_extractor.extract_from_repo`` is asked for at
       most ``max_functions`` functions under ``repo_path``.
    2. ``self.function_filter.filter_functions`` is applied to whatever
       stage 1 produced.

    The cap is handed to the extractor, i.e. it applies before filtering.
    A summary of both counts is logged.

    Args:
        repo_path: Path to repository root (converted to ``pathlib.Path``)
        max_functions: Upper bound forwarded to the extractor

    Returns:
        The list returned by the function filter
    """
    from pathlib import Path

    repo_root = Path(repo_path)

    # Stage 1: raw extraction, capped by max_functions
    extracted = self.tree_sitter_extractor.extract_from_repo(
        repo_root,
        max_functions=max_functions,
    )

    # Stage 2: quality filtering
    kept = self.function_filter.filter_functions(extracted)

    # Counts are taken after filtering, exactly as the log fields expect
    extracted_count = len(extracted)
    kept_count = len(kept)
    logger.info(
        "V2 extraction complete",
        total_extracted=extracted_count,
        after_filter=kept_count,
        filtered_out=extracted_count - kept_count,
    )

    return kept
| 394 | + |
def _function_to_embedding_text(self, func: ExtractedFunction) -> str:
    """
    Render an ExtractedFunction as the text that gets embedded (Issue #68).

    The result is a newline-joined sequence of labelled sections, in this
    order: ``Function``, ``Signature``, ``Description`` (only when the
    function has a non-empty docstring, capped at 500 characters),
    ``Language`` and ``Code`` (capped at 2000 characters).
    """
    # Identity first: qualified name disambiguates same-named functions
    sections = [
        f"Function: {func.qualified_name}",
        f"Signature: {func.signature}",
    ]

    # Docstring section is optional
    doc = func.docstring
    if doc:
        sections += [f"Description: {doc[:500]}"]

    # Language context, then the code itself as the primary content
    sections += [
        f"Language: {func.language}",
        f"Code:\n{func.code[:2000]}",
    ]

    return "\n".join(sections)
| 420 | + |
def _function_to_pinecone_metadata(
    self,
    func: ExtractedFunction,
    repo_id: str
) -> Dict:
    """
    Build the Pinecone metadata dict for one ExtractedFunction (Issue #68).

    Fields are copied from ``func`` with three adjustments:
    ``code`` is cut to 1000 characters and ``docstring`` to 500 (to stay
    within metadata size limits), and a missing ``class_name`` or
    ``docstring`` is stored as an empty string.  ``type`` is ``"method"``
    when ``func.is_method`` is truthy, otherwise ``"function"``.
    """
    metadata = {"repo_id": repo_id}

    # Identity
    metadata["file_path"] = func.file_path
    metadata["name"] = func.name
    metadata["qualified_name"] = func.qualified_name
    if func.is_method:
        metadata["type"] = "method"
    else:
        metadata["type"] = "function"

    # Source text and location
    metadata["code"] = func.code[:1000]  # Truncate for metadata limits
    metadata["signature"] = func.signature
    metadata["start_line"] = func.start_line
    metadata["end_line"] = func.end_line
    metadata["language"] = func.language

    # Optional descriptive fields fall back to empty strings
    metadata["class_name"] = func.class_name or ""
    doc_text = func.docstring or ""
    metadata["docstring"] = doc_text[:500]
    metadata["is_async"] = func.is_async

    return metadata
| 446 | + |
async def index_repository_v2(
    self,
    repo_id: str,
    repo_path: str,
    progress_callback=None
) -> int:
    """
    Index repository using Search V2 extraction (Issue #68).

    Pipeline:
    1. Extract functions via ``extract_functions_v2``.
    2. Build embedding text per function and embed it in batches of
       ``self.EMBEDDING_BATCH_SIZE``.
    3. Turn each (function, embedding) pair into a Pinecone vector whose
       id is the MD5 hex digest of ``func.id_string``.
    4. Upsert the vectors in batches of ``self.PINECONE_UPSERT_BATCH``.

    Every embedding batch is paired with its own slice of functions. If
    ``_create_embeddings_batch`` returns a different number of vectors
    than texts submitted, that batch is logged and skipped, so a short
    batch can never shift embeddings onto the wrong functions.

    Args:
        repo_id: Unique repository identifier
        repo_path: Path to repository root
        progress_callback: Optional async callable. Awaited after every
            embedding batch as ``callback(embedded, total, total)``,
            where ``embedded`` is the number of functions embedded so far
            and ``total`` is the number of functions extracted. Awaited
            once as ``callback(0, 0, 0)`` when nothing was extracted.

    Returns:
        Number of function vectors sent to Pinecone. This equals the
        number of extracted functions unless an embedding batch was
        skipped because of a count mismatch.
    """
    from services.observability import set_operation_context

    set_operation_context("indexing_v2", repo_id=repo_id)
    add_breadcrumb("Starting V2 repository indexing", category="indexing", repo_id=repo_id)

    start_time = time.time()
    logger.info("Starting V2 indexing", repo_id=repo_id, path=repo_path)

    # Step 1: Extract functions using V2 extractor
    # NOTE(review): this is a synchronous call inside a coroutine, so the
    # event loop is held for the duration of the extraction - confirm that
    # is acceptable for large repositories.
    functions = self.extract_functions_v2(repo_path)

    if not functions:
        logger.warning("No functions extracted", repo_id=repo_id)
        if progress_callback:
            await progress_callback(0, 0, 0)
        return 0

    total_functions = len(functions)
    logger.info("Functions extracted", repo_id=repo_id, count=total_functions)

    # Step 2: Generate embeddings in batches
    embedding_texts = [self._function_to_embedding_text(f) for f in functions]
    batch_size = self.EMBEDDING_BATCH_SIZE

    # (function, embedding) pairs whose alignment has been verified per batch
    aligned_pairs = []
    with track_time("embedding_generation_v2", repo_id=repo_id, total=len(embedding_texts)):
        for start in range(0, total_functions, batch_size):
            batch_functions = functions[start:start + batch_size]
            batch_texts = embedding_texts[start:start + batch_size]
            batch_embeddings = await self._create_embeddings_batch(batch_texts)

            if len(batch_embeddings) == len(batch_functions):
                aligned_pairs.extend(zip(batch_functions, batch_embeddings))
            else:
                # A short (or long) batch would misalign every later
                # function if simply appended, so drop it explicitly.
                logger.error(
                    "Embedding count mismatch, skipping batch",
                    repo_id=repo_id,
                    batch_start=start,
                    expected=len(batch_functions),
                    received=len(batch_embeddings)
                )

            if progress_callback:
                await progress_callback(
                    len(aligned_pairs),
                    total_functions,
                    total_functions
                )

            logger.debug(
                "Embeddings generated",
                progress=len(aligned_pairs),
                total=len(embedding_texts)
            )

    # Step 3: Prepare vectors for Pinecone
    vectors_to_upsert = [
        {
            "id": hashlib.md5(func.id_string.encode()).hexdigest(),
            "values": embedding,
            "metadata": self._function_to_pinecone_metadata(func, repo_id)
        }
        for func, embedding in aligned_pairs
    ]

    # Step 4: Upsert to Pinecone in batches
    add_breadcrumb("Uploading to Pinecone", category="indexing", vector_count=len(vectors_to_upsert))

    upsert_size = self.PINECONE_UPSERT_BATCH
    with track_time("pinecone_upload_v2", repo_id=repo_id, vectors=len(vectors_to_upsert)):
        for start in range(0, len(vectors_to_upsert), upsert_size):
            batch = vectors_to_upsert[start:start + upsert_size]
            # NOTE(review): upsert is not awaited, so it presumably blocks
            # the event loop for the duration of the request - confirm.
            self.index.upsert(vectors=batch)
            logger.debug(
                "Vectors uploaded",
                progress=min(start + upsert_size, len(vectors_to_upsert)),
                total=len(vectors_to_upsert)
            )

    elapsed = time.time() - start_time
    speed = total_functions / elapsed if elapsed > 0 else 0

    logger.info(
        "V2 indexing complete",
        repo_id=repo_id,
        functions=total_functions,
        indexed=len(vectors_to_upsert),
        duration_s=round(elapsed, 2),
        speed=round(speed, 1)
    )
    metrics.increment("indexing_v2_completed")
    metrics.timing("indexing_v2_duration_s", elapsed)

    # Report what was actually written, not what was merely extracted
    return len(vectors_to_upsert)
| 550 | + |
343 | 551 | async def semantic_search( |
344 | 552 | self, |
345 | 553 | query: str, |
|
0 commit comments