Skip to content

Commit d3ffafa

Browse files
authored
Merge pull request #266 from DevanshuNEU/feat/subset-indexing
feat: monorepo subset indexing -- include_paths support (OPE-106)
2 parents 7a563d3 + f4d61c9 commit d3ffafa

2 files changed

Lines changed: 151 additions & 39 deletions

File tree

backend/routes/repos.py

Lines changed: 97 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
"""Repository management routes - CRUD and indexing."""
22
from fastapi import APIRouter, HTTPException, WebSocket, WebSocketDisconnect, Depends, BackgroundTasks
3-
from pydantic import BaseModel
4-
from typing import Optional
3+
from pydantic import BaseModel, field_validator
4+
from typing import List, Optional
5+
from pathlib import Path
56
import hashlib
67
import time
78
import asyncio
@@ -177,6 +178,56 @@ async def delete_repository(
177178
raise HTTPException(status_code=500, detail="Failed to delete repository")
178179

179180

181+
def _scan_directories(local_path: Path) -> List[dict]:
    """Scan top-level directories and count code files in each.

    Runs synchronously -- call via asyncio.to_thread() from async handlers
    to avoid blocking the event loop on large repos.

    Args:
        local_path: Root of the cloned repository on disk.

    Returns:
        One dict per visible top-level directory, sorted by path, with the
        keys ``name``, ``path`` (relative to ``local_path``) and
        ``file_count`` (number of code files found below it).
    """
    import os

    skip = {"node_modules", ".git", "__pycache__", "venv", ".next", "dist", "build"}
    extensions = {".py", ".js", ".jsx", ".ts", ".tsx"}
    dirs = []
    for item in sorted(local_path.iterdir()):
        # Only real, non-hidden, non-excluded directories are offered.
        if not item.is_dir() or item.name in skip or item.name.startswith("."):
            continue

        file_count = 0
        for dirpath, dirnames, filenames in os.walk(item):
            # Prune excluded directories in place so they are never descended
            # into. Only names *below* `item` are compared, so a clone that
            # lives under e.g. /srv/build/... is no longer reported as empty.
            dirnames[:] = [d for d in dirnames if d not in skip]
            for filename in filenames:
                candidate = Path(dirpath, filename)
                # is_file() keeps broken symlinks out of the count.
                if candidate.suffix in extensions and candidate.is_file():
                    file_count += 1

        dirs.append({
            "name": item.name,
            "path": str(item.relative_to(local_path)),
            "file_count": file_count,
        })
    return dirs
203+
204+
205+
@router.get("/{repo_id}/directories")
async def get_repo_directories(
    repo_id: str,
    auth: AuthContext = Depends(require_auth),
) -> dict:
    """List the top-level directories of a cloned repository.

    Supports monorepo subset selection: the caller can show these
    directories and let the user choose which ones to index rather than
    indexing the whole repo.

    Returns a dict with ``repo_id``, ``repo_name``, ``directories`` and
    ``total_directories``.

    Raises:
        HTTPException: 404 when the repo's local path does not exist.
    """
    # Lookup is scoped to the authenticated user
    # (presumably raises 404 itself when the repo is unknown -- confirm).
    repo = get_repo_or_404(repo_id, auth.user_id)
    clone_root = Path(repo["local_path"])

    if not clone_root.exists():
        raise HTTPException(status_code=404, detail="Repo not cloned yet")

    # The scan is blocking filesystem work, so keep it off the event loop.
    directories = await asyncio.to_thread(_scan_directories, clone_root)

    # Fall back to the clone directory's name when the record has no name.
    repo_name = repo.get("name", clone_root.name)
    return {
        "repo_id": repo_id,
        "repo_name": repo_name,
        "directories": directories,
        "total_directories": len(directories),
    }
229+
230+
180231
@router.post("/{repo_id}/index")
181232
async def index_repository(
182233
repo_id: str,
@@ -275,7 +326,8 @@ async def _run_async_indexing(
275326
repo_id: str,
276327
repo: dict,
277328
user_id: str,
278-
incremental: bool = True
329+
incremental: bool = True,
330+
include_paths: Optional[List[str]] = None,
279331
):
280332
"""
281333
Background task for async indexing with real-time progress.
@@ -298,9 +350,12 @@ async def _run_async_indexing(
298350
publisher.publish_progress(repo_id, 0, 1, 0, "Starting...")
299351

300352
# Check for incremental
353+
# Skip incremental when include_paths is set -- incremental_index_repository
354+
# uses git diff which doesn't understand subset boundaries
301355
last_commit = repo_manager.get_last_indexed_commit(repo_id)
356+
can_incremental = incremental and last_commit and not include_paths
302357

303-
if incremental and last_commit:
358+
if can_incremental:
304359
logger.info("Async INCREMENTAL indexing", repo_id=repo_id, last_commit=last_commit[:8])
305360
total_functions = await indexer.incremental_index_repository(
306361
repo_id,
@@ -349,7 +404,8 @@ async def progress_callback(
349404
total_functions = await indexer.index_repository_with_progress(
350405
repo_id,
351406
repo["local_path"],
352-
progress_callback
407+
progress_callback,
408+
include_paths=include_paths,
353409
)
354410
total_files = tracked_total_files
355411
index_type = "full"
@@ -400,11 +456,35 @@ async def progress_callback(
400456
)
401457

402458

459+
class IndexConfig(BaseModel):
    """Optional config for indexing -- supports monorepo subset selection."""

    # Relative directories to index, e.g. ["packages/effect", "packages/schema"].
    include_paths: Optional[List[str]] = None
    # Passed through as the `incremental` flag of the indexing task.
    incremental: bool = True

    @field_validator("include_paths", mode="before")
    @classmethod
    def sanitize_paths(cls, v: Optional[List[str]]) -> Optional[List[str]]:
        """Reject path traversal, empty strings, and normalize slashes.

        Runs before pydantic's own type validation, so ``v`` is the raw
        request value and may be of any type.

        Returns:
            ``None`` unchanged, otherwise the entries with backslashes turned
            into ``/`` and surrounding whitespace and slashes removed.

        Raises:
            ValueError: if the value is not a list, or an entry is not a
                string, is empty after normalization, or contains a ``..``
                segment.
        """
        if v is None:
            return v
        # Without this guard a bare string would be iterated character by
        # character ("src" -> ["s", "r", "c"]), a dict would be reduced to its
        # keys, and a number would raise TypeError instead of a validation
        # error.
        if not isinstance(v, (list, tuple)):
            raise ValueError(
                f"include_paths must be a list of strings, got {type(v).__name__}"
            )
        cleaned = []
        for item in v:
            if not isinstance(item, str):
                raise ValueError(f"include_paths entries must be strings, got {type(item).__name__}")
            item = item.replace("\\", "/").strip().strip("/")
            if not item:
                raise ValueError("include_paths entries must not be empty")
            # Compare whole segments so names like "..foo" stay allowed.
            if ".." in item.split("/"):
                raise ValueError(f"Path traversal not allowed: {item}")
            cleaned.append(item)
        return cleaned
481+
482+
403483
@router.post("/{repo_id}/index/async", status_code=202)
404484
async def index_repository_async(
405485
repo_id: str,
406486
background_tasks: BackgroundTasks,
407-
incremental: bool = True,
487+
config: IndexConfig = IndexConfig(),
408488
auth: AuthContext = Depends(require_auth)
409489
):
410490
"""
@@ -463,14 +543,16 @@ async def index_repository_async(
463543
repo_id,
464544
repo,
465545
user_id,
466-
incremental
546+
incremental=config.incremental,
547+
include_paths=config.include_paths,
467548
)
468549

469550
return {
470551
"status": "indexing",
471552
"repo_id": repo_id,
472553
"message": "Indexing started. Connect to WebSocket for progress.",
473-
"websocket_url": f"/api/v1/ws/repos/{repo_id}/indexing"
554+
"websocket_url": f"/api/v1/ws/repos/{repo_id}/indexing",
555+
"include_paths": config.include_paths,
474556
}
475557

476558
except HTTPException:
@@ -500,7 +582,13 @@ async def _authenticate_websocket(websocket: WebSocket) -> Optional[dict]:
500582
# Note: WebSocket routes need to be registered on the main app, not router
501583
# This function is exported and called from main.py
502584
async def websocket_index(websocket: WebSocket, repo_id: str):
503-
"""Real-time repository indexing with progress updates."""
585+
"""Real-time repository indexing with progress updates.
586+
587+
NOTE: This WebSocket-direct-indexing path does NOT support include_paths
588+
(monorepo subset selection). Use the HTTP async endpoint instead:
589+
POST /repos/{id}/index/async with IndexConfig body.
590+
This handler is the older pattern -- kept for backward compatibility.
591+
"""
504592
user = await _authenticate_websocket(websocket)
505593
if not user:
506594
return

backend/services/indexer_optimized.py

Lines changed: 54 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -117,30 +117,44 @@ def _detect_language(self, file_path: str) -> Optional[str]:
117117
}
118118
return lang_map.get(ext)
119119

120-
def _discover_code_files(
    self, repo_path: str, include_paths: Optional[List[str]] = None
) -> List[Path]:
    """Find all code files in repository.

    Args:
        repo_path: Root directory of the cloned repository.
        include_paths: If set, only include files under these relative
            directories (e.g. ['packages/effect', 'packages/schema']).
            Only the specified subtrees are walked instead of the
            entire repo. Entries that are not directories, or that lie
            inside an excluded directory, are skipped with a warning.

    Returns:
        Paths of the files with an indexed extension, each listed once even
        when the requested subtrees overlap.
    """
    import os

    repo_path = Path(repo_path)

    extensions = {'.py', '.js', '.jsx', '.ts', '.tsx'}
    skip_dirs = {'node_modules', '.git', '__pycache__', 'venv', 'env', 'dist', 'build', '.next', '.vscode'}

    # When include_paths is set, only walk those subtrees
    if include_paths:
        roots = []
        for p in include_paths:
            subtree = repo_path / p
            if not subtree.is_dir():
                logger.warning("include_path not found, skipping", include_path=p)
                continue
            # Excluded directories stay excluded even when requested
            # explicitly. Only the repo-relative part is inspected, so the
            # location of the clone on disk has no influence.
            if skip_dirs.intersection(Path(p).parts):
                logger.warning(
                    "include_path is inside an excluded directory, skipping",
                    include_path=p,
                )
                continue
            roots.append(subtree)
    else:
        roots = [repo_path]

    code_files = []
    seen = set()  # guards against overlapping roots such as "pkg" and "pkg/sub"
    for root in roots:
        for dirpath, dirnames, filenames in os.walk(root):
            # Prune in place so excluded trees (node_modules, .git, ...) are
            # never descended into.
            dirnames[:] = [d for d in dirnames if d not in skip_dirs]
            for filename in filenames:
                file_path = Path(dirpath, filename)
                if file_path.suffix in extensions and file_path not in seen:
                    seen.add(file_path)
                    code_files.append(file_path)

    return code_files
145159

146160
async def _create_embeddings_batch(self, texts: List[str]) -> List[List[float]]:
@@ -349,11 +363,16 @@ async def _extract_functions_from_file(
349363
logger.error("Error processing file", file_path=file_path, error=str(e))
350364
return []
351365

352-
def extract_functions_v2(self, repo_path: str, max_functions: int = 5000) -> List[ExtractedFunction]:
366+
def extract_functions_v2(
367+
self, repo_path: str, max_functions: int = 5000,
368+
include_paths: Optional[List[str]] = None,
369+
) -> List[ExtractedFunction]:
353370
"""Extract and filter functions using tree-sitter."""
354371
from pathlib import Path
355372

356-
raw = self.tree_sitter_extractor.extract_from_repo(Path(repo_path), max_functions=max_functions)
373+
raw = self.tree_sitter_extractor.extract_from_repo(
374+
Path(repo_path), include_paths=include_paths, max_functions=max_functions,
375+
)
357376
filtered = self.function_filter.filter_functions(raw)
358377

359378
logger.info("V2 extraction", total=len(raw), kept=len(filtered))
@@ -397,15 +416,17 @@ async def index_repository_v2(
397416
repo_id: str,
398417
repo_path: str,
399418
progress_callback=None,
400-
generate_summaries: bool = False
419+
generate_summaries: bool = False,
420+
include_paths: Optional[List[str]] = None,
401421
) -> int:
402422
"""Index repository using V2 function-level extraction."""
403423
from services.search_v2 import generate_summaries as gen_summaries
404424

405425
start_time = time.time()
406-
logger.info("V2 indexing started", repo_id=repo_id, with_summaries=generate_summaries)
426+
logger.info("V2 indexing started", repo_id=repo_id, with_summaries=generate_summaries,
427+
include_paths=include_paths)
407428

408-
functions = self.extract_functions_v2(repo_path)
429+
functions = self.extract_functions_v2(repo_path, include_paths=include_paths)
409430
if not functions:
410431
if progress_callback:
411432
await progress_callback(0, 0, 0)
@@ -691,18 +712,21 @@ async def index_repository_with_progress(
691712
repo_id: str,
692713
repo_path: str,
693714
progress_callback,
694-
max_files: int = None
695-
):
715+
max_files: int = None,
716+
include_paths: Optional[List[str]] = None,
717+
) -> int:
696718
"""Index repository with real-time progress updates
697719
698720
Args:
699721
max_files: If set, limit indexing to first N files (for partial indexing)
722+
include_paths: If set, only index files under these directories
700723
"""
701724
start_time = time.time()
702-
logger.info("Starting optimized indexing with progress", repo_id=repo_id)
725+
logger.info("Starting optimized indexing with progress", repo_id=repo_id,
726+
include_paths=include_paths)
703727

704-
# Discover code files
705-
code_files = self._discover_code_files(repo_path)
728+
# Discover code files (filtered by include_paths if set)
729+
code_files = self._discover_code_files(repo_path, include_paths=include_paths)
706730

707731
# Apply file limit if specified (partial indexing)
708732
if max_files and len(code_files) > max_files:

0 commit comments

Comments
 (0)