Skip to content

Commit 33355da

Browse files
committed
feat(search-v2): implement function-level chunking with tree-sitter
Issue: #68

This PR implements the foundation for Search V2 by adding function-level code extraction using tree-sitter AST parsing.

Changes:
- Add TreeSitterExtractor for accurate Python/JS/TS function extraction
- Add FunctionFilter to remove test/junk functions
- Add ExtractedFunction and SearchResult data models
- Add index_repository_v2() method to indexer
- Add comprehensive test suite (15 tests passing)

Key features:
- Qualified names (Class.method) for disambiguation
- Docstring extraction for better search context
- Async function detection
- Decorator extraction
- Quality filtering (removes test_*, mock_*, etc.)

This is PR 1 of 4 in the Search V2 epic. Next: PR 2 will add AI-generated summaries (#69).
1 parent c14e1ca commit 33355da

6 files changed

Lines changed: 1425 additions & 0 deletions

File tree

backend/services/indexer_optimized.py

Lines changed: 208 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,9 @@
3131
# Search enhancement
3232
from services.search_enhancer import SearchEnhancer
3333

34+
# Search V2 - Function-level extraction (Issue #68)
35+
from services.search_v2 import TreeSitterExtractor, FunctionFilter, ExtractedFunction
36+
3437
# Observability
3538
from services.observability import logger, trace_operation, track_time, capture_exception, add_breadcrumb, metrics
3639

@@ -89,6 +92,14 @@ def __init__(self):
8992
'typescript': self._create_parser(Language(tsjavascript.language())),
9093
}
9194

95+
# Search V2: Initialize advanced tree-sitter extractor and filter (Issue #68)
96+
self.tree_sitter_extractor = TreeSitterExtractor()
97+
self.function_filter = FunctionFilter(
98+
include_private=False,
99+
include_dunders=True,
100+
max_name_length=50
101+
)
102+
92103
logger.info("OptimizedCodeIndexer initialized", model=EMBEDDING_MODEL)
93104

94105
def _create_parser(self, language) -> Parser:
@@ -340,6 +351,203 @@ async def _extract_functions_from_file(
340351
logger.error("Error processing file", file_path=file_path, error=str(e))
341352
return []
342353

354+
def extract_functions_v2(
    self,
    repo_path: str,
    max_functions: int = 5000
) -> List[ExtractedFunction]:
    """
    Run the Search V2 extraction pipeline over a repository (Issue #68).

    The repository path is handed to ``self.tree_sitter_extractor`` and the
    result is passed through ``self.function_filter``; the before/after
    counts are logged.

    Args:
        repo_path: Path to repository root
        max_functions: Cap forwarded to the extractor as ``max_functions``
            (it is applied by the extractor, i.e. before filtering)

    Returns:
        The functions that ``self.function_filter`` kept
    """
    from pathlib import Path

    repo_root = Path(repo_path)

    # Stage 1: raw extraction, bounded by max_functions
    extracted = self.tree_sitter_extractor.extract_from_repo(
        repo_root,
        max_functions=max_functions
    )

    # Stage 2: quality filtering
    kept = self.function_filter.filter_functions(extracted)

    extracted_count = len(extracted)
    kept_count = len(kept)
    logger.info(
        "V2 extraction complete",
        total_extracted=extracted_count,
        after_filter=kept_count,
        filtered_out=extracted_count - kept_count
    )

    return kept
394+
395+
def _function_to_embedding_text(self, func: ExtractedFunction) -> str:
396+
"""
397+
Create rich embedding text from ExtractedFunction (Issue #68).
398+
399+
Combines signature, docstring, and code for better semantic matching.
400+
"""
401+
parts = []
402+
403+
# Add qualified name for disambiguation
404+
parts.append(f"Function: {func.qualified_name}")
405+
406+
# Add signature
407+
parts.append(f"Signature: {func.signature}")
408+
409+
# Add docstring if present
410+
if func.docstring:
411+
parts.append(f"Description: {func.docstring[:500]}")
412+
413+
# Add language context
414+
parts.append(f"Language: {func.language}")
415+
416+
# Add code (primary content)
417+
parts.append(f"Code:\n{func.code[:2000]}")
418+
419+
return "\n".join(parts)
420+
421+
def _function_to_pinecone_metadata(
422+
self,
423+
func: ExtractedFunction,
424+
repo_id: str
425+
) -> Dict:
426+
"""
427+
Convert ExtractedFunction to Pinecone metadata (Issue #68).
428+
429+
Updated schema with qualified names and additional fields.
430+
"""
431+
return {
432+
"repo_id": repo_id,
433+
"file_path": func.file_path,
434+
"name": func.name,
435+
"qualified_name": func.qualified_name,
436+
"type": "method" if func.is_method else "function",
437+
"code": func.code[:1000], # Truncate for metadata limits
438+
"signature": func.signature,
439+
"start_line": func.start_line,
440+
"end_line": func.end_line,
441+
"language": func.language,
442+
"class_name": func.class_name or "",
443+
"docstring": (func.docstring or "")[:500],
444+
"is_async": func.is_async,
445+
}
446+
447+
async def index_repository_v2(
    self,
    repo_id: str,
    repo_path: str,
    progress_callback=None
) -> int:
    """
    Index a repository using the Search V2 extraction pipeline (Issue #68).

    Pipeline:
      1. Extract and filter functions with extract_functions_v2().
      2. Build the embedding text for each function and embed it in batches
         of EMBEDDING_BATCH_SIZE via _create_embeddings_batch().
      3. Pair every function with its own embedding and upsert the vectors
         to the Pinecone index in batches of PINECONE_UPSERT_BATCH.

    Functions and embeddings are paired batch by batch. A batch whose
    embedding count differs from its input count is logged and skipped, so a
    short or empty embedding response cannot shift embeddings onto the wrong
    functions.

    Args:
        repo_id: Unique repository identifier
        repo_path: Path to repository root
        progress_callback: Optional async callable. Awaited after each
            embedding batch with (functions_embedded_so_far, total_functions,
            total_functions), and once with (0, 0, 0) when nothing was
            extracted.

    Returns:
        Number of functions whose vectors were upserted. This equals the
        number of extracted functions unless an embedding batch was skipped.
    """
    from services.observability import set_operation_context

    set_operation_context("indexing_v2", repo_id=repo_id)
    add_breadcrumb("Starting V2 repository indexing", category="indexing", repo_id=repo_id)

    start_time = time.time()
    logger.info("Starting V2 indexing", repo_id=repo_id, path=repo_path)

    # Step 1: Extract functions using V2 extractor
    functions = self.extract_functions_v2(repo_path)

    if not functions:
        logger.warning("No functions extracted", repo_id=repo_id)
        if progress_callback:
            await progress_callback(0, 0, 0)
        return 0

    total_functions = len(functions)
    logger.info("Functions extracted", repo_id=repo_id, count=total_functions)

    # Step 2: Generate embeddings in batches, keeping each function paired
    # with the embedding produced from its own text.
    embedding_texts = [self._function_to_embedding_text(f) for f in functions]
    batch_size = self.EMBEDDING_BATCH_SIZE

    embedded = []  # (function, embedding) pairs, aligned by construction
    with track_time("embedding_generation_v2", repo_id=repo_id, total=total_functions):
        for start in range(0, total_functions, batch_size):
            batch_funcs = functions[start:start + batch_size]
            batch_texts = embedding_texts[start:start + batch_size]
            batch_embeddings = await self._create_embeddings_batch(batch_texts)

            if len(batch_embeddings) == len(batch_funcs):
                embedded.extend(zip(batch_funcs, batch_embeddings))
            else:
                # Zipping a short result would silently misalign this batch
                # and every batch after it; drop the batch instead.
                logger.error(
                    "Embedding count mismatch, skipping batch",
                    repo_id=repo_id,
                    batch_start=start,
                    expected=len(batch_funcs),
                    received=len(batch_embeddings)
                )

            if progress_callback:
                await progress_callback(
                    len(embedded),
                    total_functions,
                    total_functions
                )

            logger.debug(
                "Embeddings generated",
                progress=len(embedded),
                total=total_functions
            )

    # Step 3: Prepare vectors for Pinecone
    # NOTE(review): the vector ID depends only on func.id_string; confirm it
    # is unique across repositories that share this index.
    vectors_to_upsert = [
        {
            "id": hashlib.md5(func.id_string.encode()).hexdigest(),
            "values": embedding,
            "metadata": self._function_to_pinecone_metadata(func, repo_id)
        }
        for func, embedding in embedded
    ]

    # Step 4: Upsert to Pinecone in batches
    add_breadcrumb("Uploading to Pinecone", category="indexing", vector_count=len(vectors_to_upsert))

    upsert_batch = self.PINECONE_UPSERT_BATCH
    with track_time("pinecone_upload_v2", repo_id=repo_id, vectors=len(vectors_to_upsert)):
        for start in range(0, len(vectors_to_upsert), upsert_batch):
            batch = vectors_to_upsert[start:start + upsert_batch]
            # NOTE(review): upsert is called without await; if it is a
            # blocking client call it stalls the event loop for each batch.
            self.index.upsert(vectors=batch)
            logger.debug(
                "Vectors uploaded",
                progress=min(start + upsert_batch, len(vectors_to_upsert)),
                total=len(vectors_to_upsert)
            )

    indexed = len(vectors_to_upsert)
    elapsed = time.time() - start_time
    speed = indexed / elapsed if elapsed > 0 else 0

    logger.info(
        "V2 indexing complete",
        repo_id=repo_id,
        functions=indexed,
        skipped=total_functions - indexed,
        duration_s=round(elapsed, 2),
        speed=round(speed, 1)
    )
    metrics.increment("indexing_v2_completed")
    metrics.timing("indexing_v2_duration_s", elapsed)

    return indexed
550+
343551
async def semantic_search(
344552
self,
345553
query: str,
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
"""
2+
Search V2 Module
3+
Function-level semantic search with Triple Fusion
4+
5+
This module implements the improved search system from the
6+
codeintel-research experiments, achieving 85%+ accuracy.
7+
8+
Key components:
9+
- TreeSitterExtractor: AST-based function extraction
10+
- FunctionFilter: Quality filtering to remove junk
11+
- Types: Data models for functions and search results
12+
13+
Issue: #67 - Semantic Search V2 Epic
14+
"""
15+
from services.search_v2.types import (
16+
ExtractedFunction,
17+
SearchResult,
18+
Language,
19+
)
20+
from services.search_v2.tree_sitter_extractor import TreeSitterExtractor
21+
from services.search_v2.function_filter import (
22+
FunctionFilter,
23+
filter_functions,
24+
)
25+
26+
# Public API of the search_v2 package, grouped by role.
__all__ = [
    # Data models
    "ExtractedFunction",
    "SearchResult",
    "Language",
    # Function extraction
    "TreeSitterExtractor",
    # Quality filtering
    "FunctionFilter",
    "filter_functions",
]

0 commit comments

Comments
 (0)