From 781fdd22d2d90995bb0c5b2a38f018bdade5a858 Mon Sep 17 00:00:00 2001
From: Devanshu Rajesh Chicholikar <chicholikar.d@northeastern.edu>
Date: Sat, 28 Feb 2026 11:50:30 -0500
Subject: [PATCH 1/5] feat: monorepo subset indexing -- include_paths support
 (OPE-106)

Thread include_paths from the API through the entire indexing chain:
- POST /repos/{id}/index/async accepts an IndexConfig body with include_paths
  (incremental moves from a query parameter into this body)
- _run_async_indexing passes include_paths to the indexer
- indexer._discover_code_files filters files by include_paths
- indexer.extract_functions_v2 passes include_paths to tree-sitter
- indexer.index_repository_v2 accepts include_paths and forwards it to
  extract_functions_v2
- indexer.index_repository_with_progress passes include_paths to discovery
- tree_sitter_extractor.extract_from_repo already supported include_paths

New endpoint:
- GET /repos/{id}/directories returns top-level dirs with code-file counts
  (.py/.js/.jsx/.ts/.tsx; hidden and dependency/build dirs are skipped),
  for the UI directory picker in monorepo subset selection

Requested by Trevor Keith (Solid/trysolid.com) for Effect-TS monorepo
(200K+ functions across 30+ packages, only needs 2).
---
 backend/routes/repos.py               | 64 ++++++++++++++++++++++++---
 backend/services/indexer_optimized.py | 52 ++++++++++++++--------
 2 files changed, 91 insertions(+), 25 deletions(-)

diff --git a/backend/routes/repos.py b/backend/routes/repos.py
index 80bc709..7204bc5 100644
--- a/backend/routes/repos.py
+++ b/backend/routes/repos.py
@@ -1,7 +1,8 @@
 """Repository management routes - CRUD and indexing."""
 from fastapi import APIRouter, HTTPException, WebSocket, WebSocketDisconnect, Depends, BackgroundTasks
 from pydantic import BaseModel
-from typing import Optional
+from typing import List, Optional
+from pathlib import Path
 import hashlib
 import time
 import asyncio
@@ -177,6 +178,47 @@ async def delete_repository(
         raise HTTPException(status_code=500, detail="Failed to delete repository")
 
 
+@router.get("/{repo_id}/directories")
+async def get_repo_directories(
+    repo_id: str,
+    auth: AuthContext = Depends(require_auth),
+):
+    """Return the top-level directory tree of a cloned repo.
+
+    Used for monorepo subset selection -- lets the user pick which
+    directories to index instead of the entire repo.
+    """
+    repo = get_repo_or_404(repo_id, auth.user_id)
+    local_path = Path(repo["local_path"])
+
+    if not local_path.exists():
+        raise HTTPException(status_code=404, detail="Repo not cloned yet")
+
+    skip = {"node_modules", ".git", "__pycache__", "venv", ".next", "dist", "build"}
+    dirs = []
+    for item in sorted(local_path.iterdir()):
+        if item.is_dir() and item.name not in skip and not item.name.startswith("."):
+            # count code files in this directory
+            extensions = {".py", ".js", ".jsx", ".ts", ".tsx"}
+            file_count = sum(
+                1 for f in item.rglob("*")
+                if f.is_file() and f.suffix in extensions
+                and not any(s in f.parts for s in skip)
+            )
+            dirs.append({
+                "name": item.name,
+                "path": str(item.relative_to(local_path)),
+                "file_count": file_count,
+            })
+
+    return {
+        "repo_id": repo_id,
+        "repo_name": repo.get("name", local_path.name),
+        "directories": dirs,
+        "total_directories": len(dirs),
+    }
+
+
 @router.post("/{repo_id}/index")
 async def index_repository(
     repo_id: str,
@@ -275,7 +317,8 @@ async def _run_async_indexing(
     repo_id: str,
     repo: dict,
     user_id: str,
-    incremental: bool = True
+    incremental: bool = True,
+    include_paths: Optional[List[str]] = None,
 ):
     """
     Background task for async indexing with real-time progress.
@@ -349,7 +392,8 @@ async def progress_callback(
             total_functions = await indexer.index_repository_with_progress(
                 repo_id,
                 repo["local_path"],
-                progress_callback
+                progress_callback,
+                include_paths=include_paths,
             )
             total_files = tracked_total_files
             index_type = "full"
@@ -400,11 +444,17 @@ async def progress_callback(
             )
 
 
+class IndexConfig(BaseModel):
+    """Optional config for indexing -- supports monorepo subset selection."""
+    include_paths: Optional[List[str]] = None  # e.g. ["packages/effect", "packages/schema"]
+    incremental: bool = True
+
+
 @router.post("/{repo_id}/index/async", status_code=202)
 async def index_repository_async(
     repo_id: str,
     background_tasks: BackgroundTasks,
-    incremental: bool = True,
+    config: IndexConfig = IndexConfig(),
     auth: AuthContext = Depends(require_auth)
 ):
     """
@@ -463,14 +513,16 @@ async def index_repository_async(
             repo_id,
             repo,
             user_id,
-            incremental
+            incremental=config.incremental,
+            include_paths=config.include_paths,
         )
         
         return {
             "status": "indexing",
             "repo_id": repo_id,
             "message": "Indexing started. Connect to WebSocket for progress.",
-            "websocket_url": f"/api/v1/ws/repos/{repo_id}/indexing"
+            "websocket_url": f"/api/v1/ws/repos/{repo_id}/indexing",
+            "include_paths": config.include_paths,
         }
         
     except HTTPException:
diff --git a/backend/services/indexer_optimized.py b/backend/services/indexer_optimized.py
index d8be0ed..9e77fd8 100644
--- a/backend/services/indexer_optimized.py
+++ b/backend/services/indexer_optimized.py
@@ -117,28 +117,32 @@ def _detect_language(self, file_path: str) -> Optional[str]:
         }
         return lang_map.get(ext)
     
-    def _discover_code_files(self, repo_path: str) -> List[Path]:
-        """Find all code files in repository"""
+    def _discover_code_files(
+        self, repo_path: str, include_paths: Optional[List[str]] = None
+    ) -> List[Path]:
+        """Find all code files in repository.
+
+        Args:
+            include_paths: If set, only include files under these relative
+                directories (e.g. ['packages/effect', 'packages/schema']).
+        """
         repo_path = Path(repo_path)
         code_files = []
         
-        # Extensions to index
         extensions = {'.py', '.js', '.jsx', '.ts', '.tsx'}
-        
-        # Directories to skip
         skip_dirs = {'node_modules', '.git', '__pycache__', 'venv', 'env', 'dist', 'build', '.next', '.vscode'}
         
         for file_path in repo_path.rglob('*'):
-            # Skip directories
             if file_path.is_dir():
                 continue
-            
-            # Skip if in excluded directory
             if any(skip in file_path.parts for skip in skip_dirs):
                 continue
-            
-            # Check extension
             if file_path.suffix in extensions:
+                # Subset filter: only include files under specified paths
+                if include_paths:
+                    rel = str(file_path.relative_to(repo_path))
+                    if not any(rel.startswith(p) for p in include_paths):
+                        continue
                 code_files.append(file_path)
         
         return code_files
@@ -349,11 +353,16 @@ async def _extract_functions_from_file(
             logger.error("Error processing file", file_path=file_path, error=str(e))
             return []
 
-    def extract_functions_v2(self, repo_path: str, max_functions: int = 5000) -> List[ExtractedFunction]:
+    def extract_functions_v2(
+        self, repo_path: str, max_functions: int = 5000,
+        include_paths: Optional[List[str]] = None,
+    ) -> List[ExtractedFunction]:
         """Extract and filter functions using tree-sitter."""
         from pathlib import Path
 
-        raw = self.tree_sitter_extractor.extract_from_repo(Path(repo_path), max_functions=max_functions)
+        raw = self.tree_sitter_extractor.extract_from_repo(
+            Path(repo_path), include_paths=include_paths, max_functions=max_functions,
+        )
         filtered = self.function_filter.filter_functions(raw)
 
         logger.info("V2 extraction", total=len(raw), kept=len(filtered))
@@ -397,15 +406,17 @@ async def index_repository_v2(
         repo_id: str,
         repo_path: str,
         progress_callback=None,
-        generate_summaries: bool = False
+        generate_summaries: bool = False,
+        include_paths: Optional[List[str]] = None,
     ) -> int:
         """Index repository using V2 function-level extraction."""
         from services.search_v2 import generate_summaries as gen_summaries
 
         start_time = time.time()
-        logger.info("V2 indexing started", repo_id=repo_id, with_summaries=generate_summaries)
+        logger.info("V2 indexing started", repo_id=repo_id, with_summaries=generate_summaries,
+                    include_paths=include_paths)
 
-        functions = self.extract_functions_v2(repo_path)
+        functions = self.extract_functions_v2(repo_path, include_paths=include_paths)
         if not functions:
             if progress_callback:
                 await progress_callback(0, 0, 0)
@@ -691,18 +702,21 @@ async def index_repository_with_progress(
         repo_id: str,
         repo_path: str,
         progress_callback,
-        max_files: int = None
+        max_files: int = None,
+        include_paths: Optional[List[str]] = None,
     ):
         """Index repository with real-time progress updates
 
         Args:
             max_files: If set, limit indexing to first N files (for partial indexing)
+            include_paths: If set, only index files under these directories
         """
         start_time = time.time()
-        logger.info("Starting optimized indexing with progress", repo_id=repo_id)
+        logger.info("Starting optimized indexing with progress", repo_id=repo_id,
+                    include_paths=include_paths)
 
-        # Discover code files
-        code_files = self._discover_code_files(repo_path)
+        # Discover code files (filtered by include_paths if set)
+        code_files = self._discover_code_files(repo_path, include_paths=include_paths)
 
         # Apply file limit if specified (partial indexing)
         if max_files and len(code_files) > max_files:

From 022c2979ad33d8006b2fe443d1a2a149543cbce3 Mon Sep 17 00:00:00 2001
From: Devanshu Rajesh Chicholikar <chicholikar.d@northeastern.edu>
Date: Sat, 28 Feb 2026 12:13:33 -0500
Subject: [PATCH 2/5] fix: address PR review -- 4 verified findings

1. Add -> int return annotation to index_repository_with_progress
2. Skip incremental indexing when include_paths is set (incremental_index_repository
   uses git diff which doesn't understand subset boundaries)
3. Move blocking rglob() out of async handler into _scan_directories() sync helper,
   called via asyncio.to_thread() to avoid blocking the event loop on large repos
4. Replace full-repo rglob + startswith filter in _discover_code_files with targeted
   subtree walking (only iterate include_paths dirs) -- eliminates prefix collision
   risk (packages/effectx no longer matches packages/effect)
---
 backend/routes/repos.py               | 46 +++++++++++++++++----------
 backend/services/indexer_optimized.py | 42 ++++++++++++++----------
 2 files changed, 55 insertions(+), 33 deletions(-)

diff --git a/backend/routes/repos.py b/backend/routes/repos.py
index 7204bc5..59cafd2 100644
--- a/backend/routes/repos.py
+++ b/backend/routes/repos.py
@@ -178,28 +178,17 @@ async def delete_repository(
         raise HTTPException(status_code=500, detail="Failed to delete repository")
 
 
-@router.get("/{repo_id}/directories")
-async def get_repo_directories(
-    repo_id: str,
-    auth: AuthContext = Depends(require_auth),
-):
-    """Return the top-level directory tree of a cloned repo.
+def _scan_directories(local_path: Path) -> list:
+    """Scan top-level directories and count code files in each.
 
-    Used for monorepo subset selection -- lets the user pick which
-    directories to index instead of the entire repo.
+    Runs synchronously -- call via asyncio.to_thread() from async handlers
+    to avoid blocking the event loop on large repos.
     """
-    repo = get_repo_or_404(repo_id, auth.user_id)
-    local_path = Path(repo["local_path"])
-
-    if not local_path.exists():
-        raise HTTPException(status_code=404, detail="Repo not cloned yet")
-
     skip = {"node_modules", ".git", "__pycache__", "venv", ".next", "dist", "build"}
+    extensions = {".py", ".js", ".jsx", ".ts", ".tsx"}
     dirs = []
     for item in sorted(local_path.iterdir()):
         if item.is_dir() and item.name not in skip and not item.name.startswith("."):
-            # count code files in this directory
-            extensions = {".py", ".js", ".jsx", ".ts", ".tsx"}
             file_count = sum(
                 1 for f in item.rglob("*")
                 if f.is_file() and f.suffix in extensions
@@ -210,6 +199,26 @@ async def get_repo_directories(
                 "path": str(item.relative_to(local_path)),
                 "file_count": file_count,
             })
+    return dirs
+
+
+@router.get("/{repo_id}/directories")
+async def get_repo_directories(
+    repo_id: str,
+    auth: AuthContext = Depends(require_auth),
+):
+    """Return the top-level directory tree of a cloned repo.
+
+    Used for monorepo subset selection -- lets the user pick which
+    directories to index instead of the entire repo.
+    """
+    repo = get_repo_or_404(repo_id, auth.user_id)
+    local_path = Path(repo["local_path"])
+
+    if not local_path.exists():
+        raise HTTPException(status_code=404, detail="Repo not cloned yet")
+
+    dirs = await asyncio.to_thread(_scan_directories, local_path)
 
     return {
         "repo_id": repo_id,
@@ -341,9 +350,12 @@ async def _run_async_indexing(
             publisher.publish_progress(repo_id, 0, 1, 0, "Starting...")
         
         # Check for incremental
+        # Skip incremental when include_paths is set -- incremental_index_repository
+        # uses git diff which doesn't understand subset boundaries
         last_commit = repo_manager.get_last_indexed_commit(repo_id)
+        can_incremental = incremental and last_commit and not include_paths
         
-        if incremental and last_commit:
+        if can_incremental:
             logger.info("Async INCREMENTAL indexing", repo_id=repo_id, last_commit=last_commit[:8])
             total_functions = await indexer.incremental_index_repository(
                 repo_id,
diff --git a/backend/services/indexer_optimized.py b/backend/services/indexer_optimized.py
index 9e77fd8..60f9486 100644
--- a/backend/services/indexer_optimized.py
+++ b/backend/services/indexer_optimized.py
@@ -125,26 +125,36 @@ def _discover_code_files(
         Args:
             include_paths: If set, only include files under these relative
                 directories (e.g. ['packages/effect', 'packages/schema']).
+                Uses path-component-aware matching and only walks the
+                specified subtrees instead of the entire repo.
         """
         repo_path = Path(repo_path)
         code_files = []
-        
+
         extensions = {'.py', '.js', '.jsx', '.ts', '.tsx'}
         skip_dirs = {'node_modules', '.git', '__pycache__', 'venv', 'env', 'dist', 'build', '.next', '.vscode'}
-        
-        for file_path in repo_path.rglob('*'):
-            if file_path.is_dir():
-                continue
-            if any(skip in file_path.parts for skip in skip_dirs):
-                continue
-            if file_path.suffix in extensions:
-                # Subset filter: only include files under specified paths
-                if include_paths:
-                    rel = str(file_path.relative_to(repo_path))
-                    if not any(rel.startswith(p) for p in include_paths):
-                        continue
-                code_files.append(file_path)
-        
+
+        # When include_paths is set, only walk those subtrees
+        if include_paths:
+            roots = []
+            for p in include_paths:
+                subtree = repo_path / p
+                if subtree.is_dir():
+                    roots.append(subtree)
+                else:
+                    logger.warning("include_path not found, skipping: %s", p)
+        else:
+            roots = [repo_path]
+
+        for root in roots:
+            for file_path in root.rglob('*'):
+                if file_path.is_dir():
+                    continue
+                if any(skip in file_path.parts for skip in skip_dirs):
+                    continue
+                if file_path.suffix in extensions:
+                    code_files.append(file_path)
+
         return code_files
     
     async def _create_embeddings_batch(self, texts: List[str]) -> List[List[float]]:
@@ -704,7 +714,7 @@ async def index_repository_with_progress(
         progress_callback,
         max_files: int = None,
         include_paths: Optional[List[str]] = None,
-    ):
+    ) -> int:
         """Index repository with real-time progress updates
 
         Args:

From fb0a985513f28ea0c102d259cf42d5c2e9f42d1a Mon Sep 17 00:00:00 2001
From: Devanshu Rajesh Chicholikar <chicholikar.d@northeastern.edu>
Date: Sat, 28 Feb 2026 16:04:44 -0500
Subject: [PATCH 3/5] fix: address PR review round 2 -- type annotations, path
 validation, docs

1. Add -> dict return type to get_repo_directories
2. Add -> List[dict] return type to _scan_directories
3. Add Pydantic validator on IndexConfig.include_paths:
   - Rejects empty strings
   - Rejects path traversal (.. segments)
   - Normalizes leading/trailing slashes
4. Extend the websocket_index docstring to explain that it does NOT support
   include_paths (older pattern; use the HTTP async endpoint instead)

All 4 findings verified against code before fixing.
---
 backend/routes/repos.py | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/backend/routes/repos.py b/backend/routes/repos.py
index 59cafd2..cd96d84 100644
--- a/backend/routes/repos.py
+++ b/backend/routes/repos.py
@@ -1,6 +1,6 @@
 """Repository management routes - CRUD and indexing."""
 from fastapi import APIRouter, HTTPException, WebSocket, WebSocketDisconnect, Depends, BackgroundTasks
-from pydantic import BaseModel
+from pydantic import BaseModel, validator
 from typing import List, Optional
 from pathlib import Path
 import hashlib
@@ -178,7 +178,7 @@ async def delete_repository(
         raise HTTPException(status_code=500, detail="Failed to delete repository")
 
 
-def _scan_directories(local_path: Path) -> list:
+def _scan_directories(local_path: Path) -> List[dict]:
     """Scan top-level directories and count code files in each.
 
     Runs synchronously -- call via asyncio.to_thread() from async handlers
@@ -206,7 +206,7 @@ def _scan_directories(local_path: Path) -> list:
 async def get_repo_directories(
     repo_id: str,
     auth: AuthContext = Depends(require_auth),
-):
+) -> dict:
     """Return the top-level directory tree of a cloned repo.
 
     Used for monorepo subset selection -- lets the user pick which
@@ -461,6 +461,17 @@ class IndexConfig(BaseModel):
     include_paths: Optional[List[str]] = None  # e.g. ["packages/effect", "packages/schema"]
     incremental: bool = True
 
+    @validator("include_paths", each_item=True, pre=True)
+    @classmethod
+    def sanitize_path(cls, v: str) -> str:
+        """Reject path traversal, empty strings, and normalize slashes."""
+        v = v.strip().strip("/")
+        if not v:
+            raise ValueError("include_paths entries must not be empty")
+        if ".." in v.split("/"):
+            raise ValueError(f"Path traversal not allowed: {v}")
+        return v
+
 
 @router.post("/{repo_id}/index/async", status_code=202)
 async def index_repository_async(
@@ -564,7 +575,13 @@ async def _authenticate_websocket(websocket: WebSocket) -> Optional[dict]:
 # Note: WebSocket routes need to be registered on the main app, not router
 # This function is exported and called from main.py
 async def websocket_index(websocket: WebSocket, repo_id: str):
-    """Real-time repository indexing with progress updates."""
+    """Real-time repository indexing with progress updates.
+
+    NOTE: This WebSocket-direct-indexing path does NOT support include_paths
+    (monorepo subset selection). Use the HTTP async endpoint instead:
+    POST /repos/{id}/index/async with IndexConfig body.
+    This handler is the older pattern -- kept for backward compatibility.
+    """
     user = await _authenticate_websocket(websocket)
     if not user:
         return

From 6c145150789109a8708413139df196635455924a Mon Sep 17 00:00:00 2001
From: Devanshu Rajesh Chicholikar <chicholikar.d@northeastern.edu>
Date: Sat, 28 Feb 2026 16:26:09 -0500
Subject: [PATCH 4/5] fix: migrate IndexConfig validator from Pydantic v1 to v2

Project requires pydantic>=2.0.0 and uses field_validator elsewhere
(playground routes). Replace deprecated @validator with @field_validator:
- mode='before' instead of pre=True
- Iterate the list manually instead of each_item=True (not supported by
  @field_validator)
- Explicit isinstance check for non-string input
- Same security behavior: rejects empty, traversal, normalizes slashes
---
 backend/routes/repos.py | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/backend/routes/repos.py b/backend/routes/repos.py
index cd96d84..2619e6c 100644
--- a/backend/routes/repos.py
+++ b/backend/routes/repos.py
@@ -1,6 +1,6 @@
 """Repository management routes - CRUD and indexing."""
 from fastapi import APIRouter, HTTPException, WebSocket, WebSocketDisconnect, Depends, BackgroundTasks
-from pydantic import BaseModel, validator
+from pydantic import BaseModel, field_validator
 from typing import List, Optional
 from pathlib import Path
 import hashlib
@@ -461,16 +461,23 @@ class IndexConfig(BaseModel):
     include_paths: Optional[List[str]] = None  # e.g. ["packages/effect", "packages/schema"]
     incremental: bool = True
 
-    @validator("include_paths", each_item=True, pre=True)
+    @field_validator("include_paths", mode="before")
     @classmethod
-    def sanitize_path(cls, v: str) -> str:
+    def sanitize_paths(cls, v: Optional[List[str]]) -> Optional[List[str]]:
         """Reject path traversal, empty strings, and normalize slashes."""
-        v = v.strip().strip("/")
-        if not v:
-            raise ValueError("include_paths entries must not be empty")
-        if ".." in v.split("/"):
-            raise ValueError(f"Path traversal not allowed: {v}")
-        return v
+        if v is None:
+            return v
+        cleaned = []
+        for item in v:
+            if not isinstance(item, str):
+                raise ValueError(f"include_paths entries must be strings, got {type(item).__name__}")
+            item = item.strip().strip("/")
+            if not item:
+                raise ValueError("include_paths entries must not be empty")
+            if ".." in item.split("/"):
+                raise ValueError(f"Path traversal not allowed: {item}")
+            cleaned.append(item)
+        return cleaned
 
 
 @router.post("/{repo_id}/index/async", status_code=202)

From f4d61c9ee06c00ec8263c182ac877d43993dcd67 Mon Sep 17 00:00:00 2001
From: Devanshu Rajesh Chicholikar <chicholikar.d@northeastern.edu>
Date: Sat, 28 Feb 2026 16:52:36 -0500
Subject: [PATCH 5/5] fix: normalize backslashes in include_paths validator

Defensive measure -- converts Windows-style backslashes to forward
slashes before validation. Costs nothing, prevents edge cases if
infra or client platform changes.
---
 backend/routes/repos.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/routes/repos.py b/backend/routes/repos.py
index 2619e6c..f3f99a4 100644
--- a/backend/routes/repos.py
+++ b/backend/routes/repos.py
@@ -471,7 +471,7 @@ def sanitize_paths(cls, v: Optional[List[str]]) -> Optional[List[str]]:
         for item in v:
             if not isinstance(item, str):
                 raise ValueError(f"include_paths entries must be strings, got {type(item).__name__}")
-            item = item.strip().strip("/")
+            item = item.replace("\\", "/").strip().strip("/")
             if not item:
                 raise ValueError("include_paths entries must not be empty")
             if ".." in item.split("/"):