From 781fdd22d2d90995bb0c5b2a38f018bdade5a858 Mon Sep 17 00:00:00 2001 From: Devanshu Rajesh Chicholikar Date: Sat, 28 Feb 2026 11:50:30 -0500 Subject: [PATCH 1/5] feat: monorepo subset indexing -- include_paths support (OPE-106) Thread include_paths from API through entire indexing chain: - POST /repos/{id}/index/async accepts IndexConfig with include_paths - _run_async_indexing passes include_paths to indexer - indexer._discover_code_files filters files by include_paths - indexer.extract_functions_v2 passes include_paths to tree-sitter - indexer.index_repository_with_progress passes include_paths to discovery - tree_sitter_extractor.extract_from_repo already supported include_paths New endpoint: - GET /repos/{id}/directories returns top-level dirs with file counts (for UI directory picker in monorepo subset selection) Requested by Trevor Keith (Solid/trysolid.com) for Effect-TS monorepo (200K+ functions across 30+ packages, only needs 2). --- backend/routes/repos.py | 64 ++++++++++++++++++++++++--- backend/services/indexer_optimized.py | 52 ++++++++++++++-------- 2 files changed, 91 insertions(+), 25 deletions(-) diff --git a/backend/routes/repos.py b/backend/routes/repos.py index 80bc709..7204bc5 100644 --- a/backend/routes/repos.py +++ b/backend/routes/repos.py @@ -1,7 +1,8 @@ """Repository management routes - CRUD and indexing.""" from fastapi import APIRouter, HTTPException, WebSocket, WebSocketDisconnect, Depends, BackgroundTasks from pydantic import BaseModel -from typing import Optional +from typing import List, Optional +from pathlib import Path import hashlib import time import asyncio @@ -177,6 +178,47 @@ async def delete_repository( raise HTTPException(status_code=500, detail="Failed to delete repository") +@router.get("/{repo_id}/directories") +async def get_repo_directories( + repo_id: str, + auth: AuthContext = Depends(require_auth), +): + """Return the top-level directory tree of a cloned repo. + + Used for monorepo subset selection -- lets the user pick which + directories to index instead of the entire repo. + """ + repo = get_repo_or_404(repo_id, auth.user_id) + local_path = Path(repo["local_path"]) + + if not local_path.exists(): + raise HTTPException(status_code=404, detail="Repo not cloned yet") + + skip = {"node_modules", ".git", "__pycache__", "venv", ".next", "dist", "build"} + dirs = [] + for item in sorted(local_path.iterdir()): + if item.is_dir() and item.name not in skip and not item.name.startswith("."): + # count code files in this directory + extensions = {".py", ".js", ".jsx", ".ts", ".tsx"} + file_count = sum( + 1 for f in item.rglob("*") + if f.is_file() and f.suffix in extensions + and not any(s in f.parts for s in skip) + ) + dirs.append({ + "name": item.name, + "path": str(item.relative_to(local_path)), + "file_count": file_count, + }) + + return { + "repo_id": repo_id, + "repo_name": repo.get("name", local_path.name), + "directories": dirs, + "total_directories": len(dirs), + } + + @router.post("/{repo_id}/index") async def index_repository( repo_id: str, @@ -275,7 +317,8 @@ async def _run_async_indexing( repo_id: str, repo: dict, user_id: str, - incremental: bool = True + incremental: bool = True, + include_paths: Optional[List[str]] = None, ): """ Background task for async indexing with real-time progress. @@ -349,7 +392,8 @@ async def progress_callback( total_functions = await indexer.index_repository_with_progress( repo_id, repo["local_path"], - progress_callback + progress_callback, + include_paths=include_paths, ) total_files = tracked_total_files index_type = "full" @@ -400,11 +444,17 @@ async def progress_callback( ) +class IndexConfig(BaseModel): + """Optional config for indexing -- supports monorepo subset selection.""" + include_paths: Optional[List[str]] = None # e.g. ["packages/effect", "packages/schema"] + incremental: bool = True + + @router.post("/{repo_id}/index/async", status_code=202) async def index_repository_async( repo_id: str, background_tasks: BackgroundTasks, - incremental: bool = True, + config: IndexConfig = IndexConfig(), auth: AuthContext = Depends(require_auth) ): """ @@ -463,14 +513,16 @@ async def index_repository_async( repo_id, repo, user_id, - incremental + incremental=config.incremental, + include_paths=config.include_paths, ) return { "status": "indexing", "repo_id": repo_id, "message": "Indexing started. Connect to WebSocket for progress.", - "websocket_url": f"/api/v1/ws/repos/{repo_id}/indexing" + "websocket_url": f"/api/v1/ws/repos/{repo_id}/indexing", + "include_paths": config.include_paths, } except HTTPException: diff --git a/backend/services/indexer_optimized.py b/backend/services/indexer_optimized.py index d8be0ed..9e77fd8 100644 --- a/backend/services/indexer_optimized.py +++ b/backend/services/indexer_optimized.py @@ -117,28 +117,32 @@ def _detect_language(self, file_path: str) -> Optional[str]: } return lang_map.get(ext) - def _discover_code_files(self, repo_path: str) -> List[Path]: - """Find all code files in repository""" + def _discover_code_files( + self, repo_path: str, include_paths: Optional[List[str]] = None + ) -> List[Path]: + """Find all code files in repository. + + Args: + include_paths: If set, only include files under these relative + directories (e.g. ['packages/effect', 'packages/schema']). + """ repo_path = Path(repo_path) code_files = [] - # Extensions to index extensions = {'.py', '.js', '.jsx', '.ts', '.tsx'} - - # Directories to skip skip_dirs = {'node_modules', '.git', '__pycache__', 'venv', 'env', 'dist', 'build', '.next', '.vscode'} for file_path in repo_path.rglob('*'): - # Skip directories if file_path.is_dir(): continue - - # Skip if in excluded directory if any(skip in file_path.parts for skip in skip_dirs): continue - - # Check extension if file_path.suffix in extensions: + # Subset filter: only include files under specified paths + if include_paths: + rel = str(file_path.relative_to(repo_path)) + if not any(rel.startswith(p) for p in include_paths): + continue code_files.append(file_path) return code_files @@ -349,11 +353,16 @@ async def _extract_functions_from_file( logger.error("Error processing file", file_path=file_path, error=str(e)) return [] - def extract_functions_v2(self, repo_path: str, max_functions: int = 5000) -> List[ExtractedFunction]: + def extract_functions_v2( + self, repo_path: str, max_functions: int = 5000, + include_paths: Optional[List[str]] = None, + ) -> List[ExtractedFunction]: """Extract and filter functions using tree-sitter.""" from pathlib import Path - raw = self.tree_sitter_extractor.extract_from_repo(Path(repo_path), max_functions=max_functions) + raw = self.tree_sitter_extractor.extract_from_repo( + Path(repo_path), include_paths=include_paths, max_functions=max_functions, + ) filtered = self.function_filter.filter_functions(raw) logger.info("V2 extraction", total=len(raw), kept=len(filtered)) @@ -397,15 +406,17 @@ async def index_repository_v2( repo_id: str, repo_path: str, progress_callback=None, - generate_summaries: bool = False + generate_summaries: bool = False, + include_paths: Optional[List[str]] = None, ) -> int: """Index repository using V2 function-level extraction.""" from services.search_v2 import generate_summaries as gen_summaries start_time = time.time() - logger.info("V2 indexing started", repo_id=repo_id, with_summaries=generate_summaries) + logger.info("V2 indexing started", repo_id=repo_id, with_summaries=generate_summaries, + include_paths=include_paths) - functions = self.extract_functions_v2(repo_path) + functions = self.extract_functions_v2(repo_path, include_paths=include_paths) if not functions: if progress_callback: await progress_callback(0, 0, 0) @@ -691,18 +702,21 @@ async def index_repository_with_progress( repo_id: str, repo_path: str, progress_callback, - max_files: int = None + max_files: int = None, + include_paths: Optional[List[str]] = None, ): """Index repository with real-time progress updates Args: max_files: If set, limit indexing to first N files (for partial indexing) + include_paths: If set, only index files under these directories """ start_time = time.time() - logger.info("Starting optimized indexing with progress", repo_id=repo_id) + logger.info("Starting optimized indexing with progress", repo_id=repo_id, + include_paths=include_paths) - # Discover code files - code_files = self._discover_code_files(repo_path) + # Discover code files (filtered by include_paths if set) + code_files = self._discover_code_files(repo_path, include_paths=include_paths) # Apply file limit if specified (partial indexing) if max_files and len(code_files) > max_files: From 022c2979ad33d8006b2fe443d1a2a149543cbce3 Mon Sep 17 00:00:00 2001 From: Devanshu Rajesh Chicholikar Date: Sat, 28 Feb 2026 12:13:33 -0500 Subject: [PATCH 2/5] fix: address PR review -- 4 verified findings 1. Add -> int return annotation to index_repository_with_progress 2. Skip incremental indexing when include_paths is set (incremental_index_repository uses git diff which doesn't understand subset boundaries) 3. Move blocking rglob() out of async handler into _scan_directories() sync helper, called via asyncio.to_thread() to avoid blocking the event loop on large repos 4. Replace full-repo rglob + startswith filter in _discover_code_files with targeted subtree walking (only iterate include_paths dirs) -- eliminates prefix collision risk (packages/effectx no longer matches packages/effect) --- backend/routes/repos.py | 46 +++++++++++++++++---------- backend/services/indexer_optimized.py | 42 ++++++++++++++---------- 2 files changed, 55 insertions(+), 33 deletions(-) diff --git a/backend/routes/repos.py b/backend/routes/repos.py index 7204bc5..59cafd2 100644 --- a/backend/routes/repos.py +++ b/backend/routes/repos.py @@ -178,28 +178,17 @@ async def delete_repository( raise HTTPException(status_code=500, detail="Failed to delete repository") -@router.get("/{repo_id}/directories") -async def get_repo_directories( - repo_id: str, - auth: AuthContext = Depends(require_auth), -): - """Return the top-level directory tree of a cloned repo. +def _scan_directories(local_path: Path) -> list: + """Scan top-level directories and count code files in each. - Used for monorepo subset selection -- lets the user pick which - directories to index instead of the entire repo. + Runs synchronously -- call via asyncio.to_thread() from async handlers + to avoid blocking the event loop on large repos. """ - repo = get_repo_or_404(repo_id, auth.user_id) - local_path = Path(repo["local_path"]) - - if not local_path.exists(): - raise HTTPException(status_code=404, detail="Repo not cloned yet") - skip = {"node_modules", ".git", "__pycache__", "venv", ".next", "dist", "build"} + extensions = {".py", ".js", ".jsx", ".ts", ".tsx"} dirs = [] for item in sorted(local_path.iterdir()): if item.is_dir() and item.name not in skip and not item.name.startswith("."): - # count code files in this directory - extensions = {".py", ".js", ".jsx", ".ts", ".tsx"} file_count = sum( 1 for f in item.rglob("*") if f.is_file() and f.suffix in extensions @@ -210,6 +199,26 @@ async def get_repo_directories( "path": str(item.relative_to(local_path)), "file_count": file_count, }) + return dirs + + +@router.get("/{repo_id}/directories") +async def get_repo_directories( + repo_id: str, + auth: AuthContext = Depends(require_auth), +): + """Return the top-level directory tree of a cloned repo. + + Used for monorepo subset selection -- lets the user pick which + directories to index instead of the entire repo. + """ + repo = get_repo_or_404(repo_id, auth.user_id) + local_path = Path(repo["local_path"]) + + if not local_path.exists(): + raise HTTPException(status_code=404, detail="Repo not cloned yet") + + dirs = await asyncio.to_thread(_scan_directories, local_path) return { "repo_id": repo_id, @@ -341,9 +350,12 @@ async def _run_async_indexing( publisher.publish_progress(repo_id, 0, 1, 0, "Starting...") # Check for incremental + # Skip incremental when include_paths is set -- incremental_index_repository + # uses git diff which doesn't understand subset boundaries last_commit = repo_manager.get_last_indexed_commit(repo_id) + can_incremental = incremental and last_commit and not include_paths - if incremental and last_commit: + if can_incremental: logger.info("Async INCREMENTAL indexing", repo_id=repo_id, last_commit=last_commit[:8]) total_functions = await indexer.incremental_index_repository( repo_id, diff --git a/backend/services/indexer_optimized.py b/backend/services/indexer_optimized.py index 9e77fd8..60f9486 100644 --- a/backend/services/indexer_optimized.py +++ b/backend/services/indexer_optimized.py @@ -125,26 +125,36 @@ def _discover_code_files( Args: include_paths: If set, only include files under these relative directories (e.g. ['packages/effect', 'packages/schema']). + Uses path-component-aware matching and only walks the + specified subtrees instead of the entire repo. """ repo_path = Path(repo_path) code_files = [] - + extensions = {'.py', '.js', '.jsx', '.ts', '.tsx'} skip_dirs = {'node_modules', '.git', '__pycache__', 'venv', 'env', 'dist', 'build', '.next', '.vscode'} - - for file_path in repo_path.rglob('*'): - if file_path.is_dir(): - continue - if any(skip in file_path.parts for skip in skip_dirs): - continue - if file_path.suffix in extensions: - # Subset filter: only include files under specified paths - if include_paths: - rel = str(file_path.relative_to(repo_path)) - if not any(rel.startswith(p) for p in include_paths): - continue - code_files.append(file_path) - + + # When include_paths is set, only walk those subtrees + if include_paths: + roots = [] + for p in include_paths: + subtree = repo_path / p + if subtree.is_dir(): + roots.append(subtree) + else: + logger.warning("include_path not found, skipping: %s", p) + else: + roots = [repo_path] + + for root in roots: + for file_path in root.rglob('*'): + if file_path.is_dir(): + continue + if any(skip in file_path.parts for skip in skip_dirs): + continue + if file_path.suffix in extensions: + code_files.append(file_path) + return code_files async def _create_embeddings_batch(self, texts: List[str]) -> List[List[float]]: @@ -704,7 +714,7 @@ async def index_repository_with_progress( progress_callback, max_files: int = None, include_paths: Optional[List[str]] = None, - ): + ) -> int: """Index repository with real-time progress updates Args: From fb0a985513f28ea0c102d259cf42d5c2e9f42d1a Mon Sep 17 00:00:00 2001 From: Devanshu Rajesh Chicholikar Date: Sat, 28 Feb 2026 16:04:44 -0500 Subject: [PATCH 3/5] fix: address PR review round 2 -- type annotations, path validation, docs 1. Add -> dict return type to get_repo_directories 2. Add -> List[dict] return type to _scan_directories 3. Add Pydantic validator on IndexConfig.include_paths: - Rejects empty strings - Rejects path traversal (.. segments) - Normalizes leading/trailing slashes 4. Add docstring to websocket_index explaining it does NOT support include_paths (older pattern, use HTTP async endpoint instead) All 4 findings verified against code before fixing. --- backend/routes/repos.py | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/backend/routes/repos.py b/backend/routes/repos.py index 59cafd2..cd96d84 100644 --- a/backend/routes/repos.py +++ b/backend/routes/repos.py @@ -1,6 +1,6 @@ """Repository management routes - CRUD and indexing.""" from fastapi import APIRouter, HTTPException, WebSocket, WebSocketDisconnect, Depends, BackgroundTasks -from pydantic import BaseModel +from pydantic import BaseModel, validator from typing import List, Optional from pathlib import Path import hashlib @@ -178,7 +178,7 @@ async def delete_repository( raise HTTPException(status_code=500, detail="Failed to delete repository") -def _scan_directories(local_path: Path) -> list: +def _scan_directories(local_path: Path) -> List[dict]: """Scan top-level directories and count code files in each. Runs synchronously -- call via asyncio.to_thread() from async handlers @@ -206,7 +206,7 @@ def _scan_directories(local_path: Path) -> list: async def get_repo_directories( repo_id: str, auth: AuthContext = Depends(require_auth), -): +) -> dict: """Return the top-level directory tree of a cloned repo. Used for monorepo subset selection -- lets the user pick which @@ -461,6 +461,17 @@ class IndexConfig(BaseModel): include_paths: Optional[List[str]] = None # e.g. ["packages/effect", "packages/schema"] incremental: bool = True + @validator("include_paths", each_item=True, pre=True) + @classmethod + def sanitize_path(cls, v: str) -> str: + """Reject path traversal, empty strings, and normalize slashes.""" + v = v.strip().strip("/") + if not v: + raise ValueError("include_paths entries must not be empty") + if ".." in v.split("/"): + raise ValueError(f"Path traversal not allowed: {v}") + return v + @router.post("/{repo_id}/index/async", status_code=202) async def index_repository_async( @@ -564,7 +575,13 @@ async def _authenticate_websocket(websocket: WebSocket) -> Optional[dict]: # Note: WebSocket routes need to be registered on the main app, not router # This function is exported and called from main.py async def websocket_index(websocket: WebSocket, repo_id: str): - """Real-time repository indexing with progress updates.""" + """Real-time repository indexing with progress updates. + + NOTE: This WebSocket-direct-indexing path does NOT support include_paths + (monorepo subset selection). Use the HTTP async endpoint instead: + POST /repos/{id}/index/async with IndexConfig body. + This handler is the older pattern -- kept for backward compatibility. + """ user = await _authenticate_websocket(websocket) if not user: return From 6c145150789109a8708413139df196635455924a Mon Sep 17 00:00:00 2001 From: Devanshu Rajesh Chicholikar Date: Sat, 28 Feb 2026 16:26:09 -0500 Subject: [PATCH 4/5] fix: migrate IndexConfig validator from Pydantic v1 to v2 Project requires pydantic>=2.0.0 and uses field_validator elsewhere (playground routes). Replace deprecated @validator with @field_validator: - mode='before' instead of pre=True - Iterate list manually instead of each_item=True (removed in v2) - Explicit isinstance check for non-string input - Same security behavior: rejects empty, traversal, normalizes slashes --- backend/routes/repos.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/backend/routes/repos.py b/backend/routes/repos.py index cd96d84..2619e6c 100644 --- a/backend/routes/repos.py +++ b/backend/routes/repos.py @@ -1,6 +1,6 @@ """Repository management routes - CRUD and indexing.""" from fastapi import APIRouter, HTTPException, WebSocket, WebSocketDisconnect, Depends, BackgroundTasks -from pydantic import BaseModel, validator +from pydantic import BaseModel, field_validator from typing import List, Optional from pathlib import Path import hashlib @@ -461,16 +461,23 @@ class IndexConfig(BaseModel): include_paths: Optional[List[str]] = None # e.g. ["packages/effect", "packages/schema"] incremental: bool = True - @validator("include_paths", each_item=True, pre=True) + @field_validator("include_paths", mode="before") @classmethod - def sanitize_path(cls, v: str) -> str: + def sanitize_paths(cls, v: Optional[List[str]]) -> Optional[List[str]]: """Reject path traversal, empty strings, and normalize slashes.""" - v = v.strip().strip("/") - if not v: - raise ValueError("include_paths entries must not be empty") - if ".." in v.split("/"): - raise ValueError(f"Path traversal not allowed: {v}") - return v + if v is None: + return v + cleaned = [] + for item in v: + if not isinstance(item, str): + raise ValueError(f"include_paths entries must be strings, got {type(item).__name__}") + item = item.strip().strip("/") + if not item: + raise ValueError("include_paths entries must not be empty") + if ".." in item.split("/"): + raise ValueError(f"Path traversal not allowed: {item}") + cleaned.append(item) + return cleaned @router.post("/{repo_id}/index/async", status_code=202) From f4d61c9ee06c00ec8263c182ac877d43993dcd67 Mon Sep 17 00:00:00 2001 From: Devanshu Rajesh Chicholikar Date: Sat, 28 Feb 2026 16:52:36 -0500 Subject: [PATCH 5/5] fix: normalize backslashes in include_paths validator Defensive measure -- converts Windows-style backslashes to forward slashes before validation. Costs nothing, prevents edge cases if infra or client platform changes. --- backend/routes/repos.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/routes/repos.py b/backend/routes/repos.py index 2619e6c..f3f99a4 100644 --- a/backend/routes/repos.py +++ b/backend/routes/repos.py @@ -471,7 +471,7 @@ def sanitize_paths(cls, v: Optional[List[str]]) -> Optional[List[str]]: for item in v: if not isinstance(item, str): raise ValueError(f"include_paths entries must be strings, got {type(item).__name__}") - item = item.strip().strip("/") + item = item.replace("\\", "/").strip().strip("/") if not item: raise ValueError("include_paths entries must not be empty") if ".." in item.split("/"):