Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 94 additions & 1 deletion backend/services/indexer_optimized.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@
# Search enhancement
from services.search_enhancer import SearchEnhancer

# Search V2 - Function-level extraction (Issue #68)
from services.search_v2 import TreeSitterExtractor, FunctionFilter, ExtractedFunction

# Observability
from services.observability import logger, trace_operation, track_time, capture_exception, add_breadcrumb, metrics

Expand Down Expand Up @@ -89,6 +92,14 @@ def __init__(self):
'typescript': self._create_parser(Language(tsjavascript.language())),
}

# Search V2: Initialize advanced tree-sitter extractor and filter (Issue #68)
self.tree_sitter_extractor = TreeSitterExtractor()
self.function_filter = FunctionFilter(
include_private=False,
include_dunders=True,
max_name_length=50
)

logger.info("OptimizedCodeIndexer initialized", model=EMBEDDING_MODEL)

def _create_parser(self, language) -> Parser:
Expand Down Expand Up @@ -339,7 +350,89 @@ async def _extract_functions_from_file(
except Exception as e:
logger.error("Error processing file", file_path=file_path, error=str(e))
return []


def extract_functions_v2(self, repo_path: str, max_functions: int = 5000) -> List[ExtractedFunction]:
    """Run the V2 extractor over a repository and filter its output.

    Args:
        repo_path: Repository location; wrapped in ``pathlib.Path`` before
            being handed to ``self.tree_sitter_extractor``.
        max_functions: Forwarded unchanged to ``extract_from_repo`` as the
            ``max_functions`` keyword. It is applied by the extractor, i.e.
            before ``self.function_filter`` runs.

    Returns:
        Whatever ``self.function_filter.filter_functions`` returns for the
        extractor's output.
    """
    from pathlib import Path

    repo_root = Path(repo_path)
    candidates = self.tree_sitter_extractor.extract_from_repo(repo_root, max_functions=max_functions)
    kept = self.function_filter.filter_functions(candidates)

    # Log both counts so the effect of the filter is visible per run.
    logger.info("V2 extraction", total=len(candidates), kept=len(kept))
    return kept

def _build_embedding_text(self, func: ExtractedFunction) -> str:
"""Build rich text for embedding."""
parts = [
f"Function: {func.qualified_name}",
f"Signature: {func.signature}",
]
if func.docstring:
parts.append(f"Description: {func.docstring[:500]}")
parts.append(f"Language: {func.language}")
parts.append(f"Code:\n{func.code[:2000]}")
return "\n".join(parts)

def _build_metadata(self, func: ExtractedFunction, repo_id: str) -> Dict:
"""Build Pinecone metadata from function."""
return {
"repo_id": repo_id,
"file_path": func.file_path,
"name": func.name,
"qualified_name": func.qualified_name,
"type": "method" if func.is_method else "function",
"code": func.code[:1000],
"signature": func.signature,
"start_line": func.start_line,
"end_line": func.end_line,
"language": func.language,
"class_name": func.class_name or "",
"docstring": (func.docstring or "")[:500],
"is_async": func.is_async,
}

async def index_repository_v2(self, repo_id: str, repo_path: str, progress_callback=None) -> int:
    """Index a repository with the V2 function-level pipeline.

    Steps: extract and filter functions, build one embedding text per
    function, embed the texts in slices of ``EMBEDDING_BATCH_SIZE``, then
    upsert the resulting vectors in slices of ``PINECONE_UPSERT_BATCH``.

    Args:
        repo_id: Identifier written into every vector's metadata.
        repo_path: Repository location passed to ``extract_functions_v2``.
        progress_callback: Optional awaitable callable taking three
            positional ints. It is awaited with ``(0, 0, 0)`` when nothing
            was extracted, otherwise with ``(embeddings_so_far,
            total_functions, total_functions)`` after each embedding batch.

    Returns:
        The number of functions that survived filtering (0 when none did).
    """
    began = time.time()
    logger.info("V2 indexing started", repo_id=repo_id)

    functions = self.extract_functions_v2(repo_path)
    if not functions:
        # Nothing to embed: report a zeroed progress tick and stop early.
        if progress_callback:
            await progress_callback(0, 0, 0)
        return 0

    function_count = len(functions)

    # --- embeddings -----------------------------------------------------
    texts = list(map(self._build_embedding_text, functions))
    embeddings = []
    embed_step = self.EMBEDDING_BATCH_SIZE

    for offset in range(0, len(texts), embed_step):
        chunk = texts[offset:offset + embed_step]
        chunk_embeddings = await self._create_embeddings_batch(chunk)
        embeddings.extend(chunk_embeddings)

        if progress_callback:
            await progress_callback(len(embeddings), function_count, function_count)

    # --- vectors --------------------------------------------------------
    # NOTE(review): zip() stops at the shorter sequence, so this relies on
    # one embedding being returned per text — confirm against
    # _create_embeddings_batch.
    vectors = []
    for fn, embedding in zip(functions, embeddings):
        vector_id = hashlib.md5(fn.id_string.encode()).hexdigest()
        vectors.append(
            {
                "id": vector_id,
                "values": embedding,
                "metadata": self._build_metadata(fn, repo_id),
            }
        )

    # --- upsert ---------------------------------------------------------
    upsert_step = self.PINECONE_UPSERT_BATCH
    for offset in range(0, len(vectors), upsert_step):
        self.index.upsert(vectors=vectors[offset:offset + upsert_step])

    duration = time.time() - began
    logger.info("V2 indexing complete", repo_id=repo_id, functions=function_count, duration_s=round(duration, 2))
    metrics.increment("indexing_v2_completed")

    return function_count

async def semantic_search(
self,
query: str,
Expand Down
13 changes: 13 additions & 0 deletions backend/services/search_v2/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
"""Search V2: Function-level semantic search."""
from services.search_v2.types import ExtractedFunction, SearchResult, Language
from services.search_v2.tree_sitter_extractor import TreeSitterExtractor
from services.search_v2.function_filter import FunctionFilter, filter_functions

# Names exported by `from services.search_v2 import *`; each one is
# imported from a submodule above.
__all__ = [
    "ExtractedFunction",
    "SearchResult",
    "Language",
    "TreeSitterExtractor",
    "FunctionFilter",
    "filter_functions",
]
127 changes: 127 additions & 0 deletions backend/services/search_v2/function_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
"""Filter out low-quality functions from search index."""
from typing import List, Set
from services.search_v2.types import ExtractedFunction
from services.observability import logger

# Name prefixes that mark a function as junk (tests, mocks, fixtures, ...).
# Compared against the lower-cased function name with str.startswith.
JUNK_PREFIXES = (
    'test_', 'time_', 'rand_', 'mock_', 'fake_', 'stub_',
    'setup_', 'teardown_', 'fixture_', 'check_',
    '_test_', '_time_', '_rand_', '_mock_',
    'assert_', 'verify_', 'validate_test',
)

# Substrings that mark a function as junk when found anywhere in the
# lower-cased function name.
JUNK_PATTERNS = (
    '_fixture', '_setup', '_teardown', '_helper_test',
    'benchmark', '_bench', '_perf_',
    '_random_data', '_test_data', '_sample_data',
    'from_int_dict', 'from_test', '_for_test',
    '_for_split', 'create_data_for',
    'doesnt_use_', 'check_main',
)

# Substrings that mark a file as junk when found anywhere in the lower-cased
# file path.
# NOTE(review): this is a plain substring test, so e.g. 'test/' also matches
# a directory named 'latest/' — confirm whether segment matching is wanted.
JUNK_PATHS = (
    'tests/', 'test/', 'testing/', '/tests/', '/test/', '/testing/',
    'benchmarks/', 'asv_bench/', 'bench/',
    'examples/', 'docs/', 'doc/',
    '_testing/', 'conftest',
    'fixtures/', '_fixtures/',
    'mock/', 'mocks/', 'stubs/',
)

# Exact (lower-cased) function names that are always kept, regardless of the
# junk rules above and of the file they live in.
PUBLIC_API: Set[str] = {
    'read_csv', 'read_excel', 'read_json', 'read_parquet', 'read_sql',
    'to_csv', 'to_excel', 'to_json', 'to_parquet', 'to_sql',
    'merge', 'concat', 'groupby', 'pivot', 'melt',
    'fillna', 'dropna', 'isna', 'notna',
    'apply', 'map', 'transform', 'agg', 'aggregate',
    'sort_values', 'sort_index', 'reset_index', 'set_index',
    'authenticate', 'authorize', 'login', 'logout',
    'validate', 'serialize', 'deserialize',
    'create', 'read', 'update', 'delete',
    'get', 'set', 'post', 'put', 'patch',
    'connect', 'disconnect', 'send', 'receive',
    'parse', 'format', 'convert',
    'load', 'save', 'export', 'import_',
    'init', 'setup', 'configure', 'initialize',
}


class FunctionFilter:
    """Filter functions to keep only high-quality, searchable ones.

    A function is judged by its ``name`` and ``file_path`` attributes only.
    Rules are applied in this order (first match wins):

    1. name is exactly one of ``PUBLIC_API``            -> keep
    2. path contains one of ``JUNK_PATHS``              -> drop
    3. name starts with one of ``JUNK_PREFIXES``        -> drop
    4. name contains one of ``JUNK_PATTERNS``           -> drop
    5. name longer than ``max_name_length``             -> drop
    6. dunder name (``__x__``)                          -> ``include_dunders``
    7. any other leading-underscore name                -> ``include_private``
    8. ``make_*`` with 'test' in path or 'random' in name -> drop
    9. otherwise                                        -> keep
    """

    def __init__(
        self,
        include_private: bool = False,
        include_dunders: bool = True,
        max_name_length: int = 50,
    ):
        """Configure the filter.

        Args:
            include_private: Keep names with a leading underscore that are
                not dunders (``_x`` and ``__x``).
            include_dunders: Keep names of the form ``__x__``.
            max_name_length: Names longer than this many characters are
                dropped.
        """
        self.include_private = include_private
        self.include_dunders = include_dunders
        self.max_name_length = max_name_length

    def filter_functions(self, functions: List[ExtractedFunction]) -> List[ExtractedFunction]:
        """Return the functions that pass ``_keep``, preserving input order.

        Emits a debug log with the kept/removed counts when at least one
        function was removed.
        """
        filtered = [f for f in functions if self._keep(f)]
        removed = len(functions) - len(filtered)

        if removed > 0:
            logger.debug("Filtered functions", kept=len(filtered), removed=removed)

        return filtered

    def _keep(self, func: ExtractedFunction) -> bool:
        """Decide whether ``func`` should stay in the index.

        Args:
            func: Object exposing ``name`` and ``file_path`` string
                attributes.

        Returns:
            True to keep the function, False to drop it.
        """
        raw_name = func.name
        name = raw_name.lower()
        path = func.file_path.lower()

        # Always keep public API. This must be an exact-name test: a
        # substring test would let short entries such as 'get', 'set' or
        # 'init' whitelist nearly every function (e.g. 'test_get_user',
        # '__init__') and make the remaining rules unreachable.
        if name in PUBLIC_API:
            return True

        # Skip junk paths.
        if any(p in path for p in JUNK_PATHS):
            return False

        # Skip junk prefixes (str.startswith accepts a tuple of options).
        if name.startswith(JUNK_PREFIXES):
            return False

        # Skip junk patterns.
        if any(p in name for p in JUNK_PATTERNS):
            return False

        # Skip long auto-generated names.
        if len(name) > self.max_name_length:
            return False

        # Dunders are checked before the generic underscore rule so that
        # '__x__' follows include_dunders while both '_x' and '__x'
        # (double-underscore private) follow include_private.
        if raw_name.startswith('__') and raw_name.endswith('__'):
            return self.include_dunders

        if raw_name.startswith('_'):
            return self.include_private

        # Skip test data generators.
        if name.startswith('make_') and ('test' in path or 'random' in name):
            return False

        return True

    def get_stats(self, functions: List[ExtractedFunction]) -> dict:
        """Count how many of ``functions`` would be kept and removed.

        Returns:
            A dict with the integer keys ``"total"``, ``"kept"`` and
            ``"removed"``.
        """
        kept = sum(1 for f in functions if self._keep(f))
        return {
            "total": len(functions),
            "kept": kept,
            "removed": len(functions) - kept,
        }


# Shared module-level instance built with FunctionFilter's default arguments.
default_filter = FunctionFilter()


def filter_functions(functions: List[ExtractedFunction]) -> List[ExtractedFunction]:
    """Run ``functions`` through the module-level ``default_filter``.

    Convenience wrapper for callers that do not need a custom
    ``FunctionFilter`` configuration.
    """
    kept = default_filter.filter_functions(functions)
    return kept
Loading