11"""Repository management routes - CRUD and indexing."""
22from fastapi import APIRouter , HTTPException , WebSocket , WebSocketDisconnect , Depends , BackgroundTasks
3- from pydantic import BaseModel
4- from typing import Optional
3+ from pydantic import BaseModel , field_validator
4+ from typing import List , Optional
5+ from pathlib import Path
56import hashlib
67import time
78import asyncio
@@ -177,6 +178,56 @@ async def delete_repository(
177178 raise HTTPException (status_code = 500 , detail = "Failed to delete repository" )
178179
179180
181+ def _scan_directories (local_path : Path ) -> List [dict ]:
182+ """Scan top-level directories and count code files in each.
183+
184+ Runs synchronously -- call via asyncio.to_thread() from async handlers
185+ to avoid blocking the event loop on large repos.
186+ """
187+ skip = {"node_modules" , ".git" , "__pycache__" , "venv" , ".next" , "dist" , "build" }
188+ extensions = {".py" , ".js" , ".jsx" , ".ts" , ".tsx" }
189+ dirs = []
190+ for item in sorted (local_path .iterdir ()):
191+ if item .is_dir () and item .name not in skip and not item .name .startswith ("." ):
192+ file_count = sum (
193+ 1 for f in item .rglob ("*" )
194+ if f .is_file () and f .suffix in extensions
195+ and not any (s in f .parts for s in skip )
196+ )
197+ dirs .append ({
198+ "name" : item .name ,
199+ "path" : str (item .relative_to (local_path )),
200+ "file_count" : file_count ,
201+ })
202+ return dirs
203+
204+
@router.get("/{repo_id}/directories")
async def get_repo_directories(
    repo_id: str,
    auth: AuthContext = Depends(require_auth),
) -> dict:
    """Return the top-level directory tree of a cloned repo.

    Used for monorepo subset selection -- lets the user pick which
    directories to index instead of the entire repo.

    Args:
        repo_id: Identifier of the repository, taken from the URL path.
        auth: Auth context resolved by ``require_auth``; its ``user_id`` is
            passed to ``get_repo_or_404`` for the lookup.

    Returns:
        A dict with ``repo_id``, ``repo_name`` (falls back to the checkout
        directory name when the repo record has no ``name``),
        ``directories`` (output of ``_scan_directories``) and
        ``total_directories``.

    Raises:
        HTTPException: 404 when the repo's local path is not a directory on
            disk. ``get_repo_or_404`` presumably raises its own 404 for an
            unknown repo -- confirm against its definition.
    """
    repo = get_repo_or_404(repo_id, auth.user_id)
    local_path = Path(repo["local_path"])

    # is_dir() rather than exists(): a path that exists but is a regular file
    # would pass an exists() check and then make iterdir() raise
    # NotADirectoryError, surfacing as an unhandled 500.
    if not local_path.is_dir():
        raise HTTPException(status_code=404, detail="Repo not cloned yet")

    # The scan walks the filesystem synchronously, so run it off the event loop.
    dirs = await asyncio.to_thread(_scan_directories, local_path)

    return {
        "repo_id": repo_id,
        "repo_name": repo.get("name", local_path.name),
        "directories": dirs,
        "total_directories": len(dirs),
    }
229+
230+
180231@router .post ("/{repo_id}/index" )
181232async def index_repository (
182233 repo_id : str ,
@@ -275,7 +326,8 @@ async def _run_async_indexing(
275326 repo_id : str ,
276327 repo : dict ,
277328 user_id : str ,
278- incremental : bool = True
329+ incremental : bool = True ,
330+ include_paths : Optional [List [str ]] = None ,
279331):
280332 """
281333 Background task for async indexing with real-time progress.
@@ -298,9 +350,12 @@ async def _run_async_indexing(
298350 publisher .publish_progress (repo_id , 0 , 1 , 0 , "Starting..." )
299351
300352 # Check for incremental
353+ # Skip incremental when include_paths is set -- incremental_index_repository
354+ # uses git diff which doesn't understand subset boundaries
301355 last_commit = repo_manager .get_last_indexed_commit (repo_id )
356+ can_incremental = incremental and last_commit and not include_paths
302357
303- if incremental and last_commit :
358+ if can_incremental :
304359 logger .info ("Async INCREMENTAL indexing" , repo_id = repo_id , last_commit = last_commit [:8 ])
305360 total_functions = await indexer .incremental_index_repository (
306361 repo_id ,
@@ -349,7 +404,8 @@ async def progress_callback(
349404 total_functions = await indexer .index_repository_with_progress (
350405 repo_id ,
351406 repo ["local_path" ],
352- progress_callback
407+ progress_callback ,
408+ include_paths = include_paths ,
353409 )
354410 total_files = tracked_total_files
355411 index_type = "full"
@@ -400,11 +456,35 @@ async def progress_callback(
400456 )
401457
402458
class IndexConfig(BaseModel):
    """Optional config for indexing -- supports monorepo subset selection."""

    # Repo-relative directories to index, e.g. ["packages/effect", "packages/schema"].
    # None means no subset restriction was requested.
    include_paths: Optional[List[str]] = None
    # Whether an incremental index may be attempted.
    incremental: bool = True

    @field_validator("include_paths", mode="before")
    @classmethod
    def sanitize_paths(cls, v: Optional[List[str]]) -> Optional[List[str]]:
        """Reject path traversal, empty strings, and normalize slashes.

        Runs in ``mode="before"``, i.e. on the raw input, so the container
        type is checked here explicitly.

        Args:
            v: Raw ``include_paths`` value, or None.

        Returns:
            None when ``v`` is None; otherwise a new list in which every
            entry uses forward slashes, has surrounding whitespace and
            leading/trailing slashes removed, and has repeated slashes
            collapsed.

        Raises:
            ValueError: If ``v`` is not a list/tuple, an entry is not a
                string, an entry is empty after normalization, or an entry
                contains a ``..`` segment.
        """
        if v is None:
            return v
        # A bare string (or dict) is iterable too; without this check "abc"
        # would be silently accepted as ["a", "b", "c"].
        if not isinstance(v, (list, tuple)):
            raise ValueError(
                f"include_paths must be a list of strings, got {type(v).__name__}"
            )
        cleaned = []
        for item in v:
            if not isinstance(item, str):
                raise ValueError(
                    f"include_paths entries must be strings, got {type(item).__name__}"
                )
            item = item.replace("\\", "/").strip().strip("/")
            if not item:
                raise ValueError("include_paths entries must not be empty")
            # Drop empty segments so "a//b" and "a/b" normalize identically.
            segments = [segment for segment in item.split("/") if segment]
            if ".." in segments:
                raise ValueError(f"Path traversal not allowed: {item}")
            cleaned.append("/".join(segments))
        return cleaned
481+
482+
403483@router .post ("/{repo_id}/index/async" , status_code = 202 )
404484async def index_repository_async (
405485 repo_id : str ,
406486 background_tasks : BackgroundTasks ,
407- incremental : bool = True ,
487+ config : IndexConfig = IndexConfig () ,
408488 auth : AuthContext = Depends (require_auth )
409489):
410490 """
@@ -463,14 +543,16 @@ async def index_repository_async(
463543 repo_id ,
464544 repo ,
465545 user_id ,
466- incremental
546+ incremental = config .incremental ,
547+ include_paths = config .include_paths ,
467548 )
468549
469550 return {
470551 "status" : "indexing" ,
471552 "repo_id" : repo_id ,
472553 "message" : "Indexing started. Connect to WebSocket for progress." ,
473- "websocket_url" : f"/api/v1/ws/repos/{ repo_id } /indexing"
554+ "websocket_url" : f"/api/v1/ws/repos/{ repo_id } /indexing" ,
555+ "include_paths" : config .include_paths ,
474556 }
475557
476558 except HTTPException :
@@ -500,7 +582,13 @@ async def _authenticate_websocket(websocket: WebSocket) -> Optional[dict]:
500582# Note: WebSocket routes need to be registered on the main app, not router
501583# This function is exported and called from main.py
502584async def websocket_index (websocket : WebSocket , repo_id : str ):
503- """Real-time repository indexing with progress updates."""
585+ """Real-time repository indexing with progress updates.
586+
587+ NOTE: This WebSocket-direct-indexing path does NOT support include_paths
588+ (monorepo subset selection). Use the HTTP async endpoint instead:
589+ POST /repos/{id}/index/async with IndexConfig body.
590+ This handler is the older pattern -- kept for backward compatibility.
591+ """
504592 user = await _authenticate_websocket (websocket )
505593 if not user :
506594 return
0 commit comments