Skip to content

Commit 781fdd2

Browse files
committed
feat: monorepo subset indexing -- include_paths support (OPE-106)
Thread include_paths from the API through the entire indexing chain:
- POST /repos/{id}/index/async accepts an IndexConfig body with include_paths
- _run_async_indexing passes include_paths to the indexer
- indexer._discover_code_files filters files by include_paths
- indexer.extract_functions_v2 passes include_paths to tree-sitter
- indexer.index_repository_with_progress passes include_paths to file discovery
- tree_sitter_extractor.extract_from_repo already supported include_paths

New endpoint:
- GET /repos/{id}/directories returns top-level directories with code-file counts (for the UI directory picker used in monorepo subset selection)

Requested by Trevor Keith (Solid/trysolid.com) for the Effect-TS monorepo (200K+ functions across 30+ packages, of which only 2 are needed).
1 parent 7a563d3 commit 781fdd2

2 files changed

Lines changed: 91 additions & 25 deletions

File tree

backend/routes/repos.py

Lines changed: 58 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
"""Repository management routes - CRUD and indexing."""
22
from fastapi import APIRouter, HTTPException, WebSocket, WebSocketDisconnect, Depends, BackgroundTasks
33
from pydantic import BaseModel
4-
from typing import Optional
4+
from typing import List, Optional
5+
from pathlib import Path
56
import hashlib
67
import time
78
import asyncio
@@ -177,6 +178,47 @@ async def delete_repository(
177178
raise HTTPException(status_code=500, detail="Failed to delete repository")
178179

179180

181+
def _count_repo_code_files(directory, extensions, skip_dirs):
    """Count files below *directory* whose extension is in *extensions*.

    Directories named in *skip_dirs* are pruned before descent, so large
    dependency trees (e.g. node_modules) are never walked at all.
    """
    import os  # local import: only this helper needs it

    total = 0
    for _current, dirnames, filenames in os.walk(directory):
        # Editing dirnames in place tells os.walk not to descend into them.
        dirnames[:] = [d for d in dirnames if d not in skip_dirs]
        total += sum(
            1 for name in filenames
            if os.path.splitext(name)[1] in extensions
        )
    return total


def _summarize_top_level_dirs(local_path):
    """Build the directory listing for ``get_repo_directories``.

    Synchronous on purpose: it performs blocking filesystem I/O and is
    meant to be run in a worker thread by the async route handler.
    """
    # Same names the indexer's _discover_code_files skips, so the counts
    # shown here match what indexing would actually pick up.
    skip = {
        "node_modules", ".git", "__pycache__", "venv", "env",
        "dist", "build", ".next", ".vscode",
    }
    extensions = {".py", ".js", ".jsx", ".ts", ".tsx"}

    dirs = []
    for item in sorted(local_path.iterdir()):
        # Hidden and skipped directories are not offered at the top level.
        if not item.is_dir() or item.name in skip or item.name.startswith("."):
            continue
        dirs.append({
            "name": item.name,
            "path": str(item.relative_to(local_path)),
            "file_count": _count_repo_code_files(item, extensions, skip),
        })
    return dirs


@router.get("/{repo_id}/directories")
async def get_repo_directories(
    repo_id: str,
    auth: AuthContext = Depends(require_auth),
):
    """Return the top-level directories of a cloned repo with code-file counts.

    Used for monorepo subset selection -- lets the user pick which
    directories to index instead of the entire repo.

    Returns:
        A dict with ``repo_id``, ``repo_name``, ``directories`` (one entry
        per visible top-level directory with ``name``, ``path`` and
        ``file_count``) and ``total_directories``.

    Raises:
        HTTPException: 404 if the repo has no local clone directory on disk.
    """
    repo = get_repo_or_404(repo_id, auth.user_id)

    # A missing/empty local_path, or a path that is not a directory, both
    # mean there is nothing to list yet.
    raw_path = repo.get("local_path")
    local_path = Path(raw_path) if raw_path else None
    if local_path is None or not local_path.is_dir():
        raise HTTPException(status_code=404, detail="Repo not cloned yet")

    # Walking a large monorepo is blocking I/O; run it off the event loop so
    # other requests and WebSocket progress updates are not stalled.
    loop = asyncio.get_running_loop()
    dirs = await loop.run_in_executor(None, _summarize_top_level_dirs, local_path)

    return {
        "repo_id": repo_id,
        "repo_name": repo.get("name", local_path.name),
        "directories": dirs,
        "total_directories": len(dirs),
    }
220+
221+
180222
@router.post("/{repo_id}/index")
181223
async def index_repository(
182224
repo_id: str,
@@ -275,7 +317,8 @@ async def _run_async_indexing(
275317
repo_id: str,
276318
repo: dict,
277319
user_id: str,
278-
incremental: bool = True
320+
incremental: bool = True,
321+
include_paths: Optional[List[str]] = None,
279322
):
280323
"""
281324
Background task for async indexing with real-time progress.
@@ -349,7 +392,8 @@ async def progress_callback(
349392
total_functions = await indexer.index_repository_with_progress(
350393
repo_id,
351394
repo["local_path"],
352-
progress_callback
395+
progress_callback,
396+
include_paths=include_paths,
353397
)
354398
total_files = tracked_total_files
355399
index_type = "full"
@@ -400,11 +444,17 @@ async def progress_callback(
400444
)
401445

402446

447+
class IndexConfig(BaseModel):
    """Request body for async indexing; enables monorepo subset selection.

    Both fields are optional, so an empty body keeps the defaults.
    """

    # Repo-relative directories to restrict indexing to,
    # e.g. ["packages/effect", "packages/schema"]. None applies no filter.
    include_paths: Optional[List[str]] = None

    # Forwarded unchanged to the background indexing task's `incremental` flag.
    incremental: bool = True
451+
452+
403453
@router.post("/{repo_id}/index/async", status_code=202)
404454
async def index_repository_async(
405455
repo_id: str,
406456
background_tasks: BackgroundTasks,
407-
incremental: bool = True,
457+
config: IndexConfig = IndexConfig(),
408458
auth: AuthContext = Depends(require_auth)
409459
):
410460
"""
@@ -463,14 +513,16 @@ async def index_repository_async(
463513
repo_id,
464514
repo,
465515
user_id,
466-
incremental
516+
incremental=config.incremental,
517+
include_paths=config.include_paths,
467518
)
468519

469520
return {
470521
"status": "indexing",
471522
"repo_id": repo_id,
472523
"message": "Indexing started. Connect to WebSocket for progress.",
473-
"websocket_url": f"/api/v1/ws/repos/{repo_id}/indexing"
524+
"websocket_url": f"/api/v1/ws/repos/{repo_id}/indexing",
525+
"include_paths": config.include_paths,
474526
}
475527

476528
except HTTPException:

backend/services/indexer_optimized.py

Lines changed: 33 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -117,28 +117,32 @@ def _detect_language(self, file_path: str) -> Optional[str]:
117117
}
118118
return lang_map.get(ext)
119119

120-
def _discover_code_files(self, repo_path: str) -> List[Path]:
121-
"""Find all code files in repository"""
120+
def _discover_code_files(
    self, repo_path: str, include_paths: Optional[List[str]] = None
) -> List[Path]:
    """Find all indexable code files in a repository.

    Args:
        repo_path: Root directory of the cloned repository.
        include_paths: If set (non-empty), only include files under these
            repo-relative paths (e.g. ['packages/effect', 'packages/schema']).
            Matching is done on whole path components, so 'packages/effect'
            does NOT match 'packages/effect-platform'. Leading './',
            leading/trailing slashes and backslash separators are tolerated;
            an entry that normalizes to the repo root ('', '.', '/')
            matches every file.

    Returns:
        Paths of files with a supported extension that are not inside a
        skipped directory, in the order ``Path.rglob`` yields them.
    """
    root = Path(repo_path)

    extensions = {'.py', '.js', '.jsx', '.ts', '.tsx'}
    skip_dirs = {'node_modules', '.git', '__pycache__', 'venv', 'env', 'dist', 'build', '.next', '.vscode'}

    # Normalize each include path once into a tuple of path components so the
    # per-file check is a cheap, separator-independent prefix comparison.
    prefixes = [
        tuple(
            part for part in raw.replace('\\', '/').split('/')
            if part and part != '.'
        )
        for raw in (include_paths or [])
    ]

    code_files = []
    for file_path in root.rglob('*'):
        if file_path.is_dir():
            continue

        # Compare against repo-relative components only: a clone that lives
        # under e.g. /srv/build/... must not have every file skipped.
        rel_parts = file_path.relative_to(root).parts
        if not skip_dirs.isdisjoint(rel_parts):
            continue

        if file_path.suffix not in extensions:
            continue

        # Subset filter: keep the file only if some include path is a
        # component-wise prefix of its relative path.
        if prefixes and not any(
            rel_parts[:len(prefix)] == prefix for prefix in prefixes
        ):
            continue

        code_files.append(file_path)

    return code_files
@@ -349,11 +353,16 @@ async def _extract_functions_from_file(
349353
logger.error("Error processing file", file_path=file_path, error=str(e))
350354
return []
351355

352-
def extract_functions_v2(self, repo_path: str, max_functions: int = 5000) -> List[ExtractedFunction]:
356+
def extract_functions_v2(
357+
self, repo_path: str, max_functions: int = 5000,
358+
include_paths: Optional[List[str]] = None,
359+
) -> List[ExtractedFunction]:
353360
"""Extract and filter functions using tree-sitter."""
354361
from pathlib import Path
355362

356-
raw = self.tree_sitter_extractor.extract_from_repo(Path(repo_path), max_functions=max_functions)
363+
raw = self.tree_sitter_extractor.extract_from_repo(
364+
Path(repo_path), include_paths=include_paths, max_functions=max_functions,
365+
)
357366
filtered = self.function_filter.filter_functions(raw)
358367

359368
logger.info("V2 extraction", total=len(raw), kept=len(filtered))
@@ -397,15 +406,17 @@ async def index_repository_v2(
397406
repo_id: str,
398407
repo_path: str,
399408
progress_callback=None,
400-
generate_summaries: bool = False
409+
generate_summaries: bool = False,
410+
include_paths: Optional[List[str]] = None,
401411
) -> int:
402412
"""Index repository using V2 function-level extraction."""
403413
from services.search_v2 import generate_summaries as gen_summaries
404414

405415
start_time = time.time()
406-
logger.info("V2 indexing started", repo_id=repo_id, with_summaries=generate_summaries)
416+
logger.info("V2 indexing started", repo_id=repo_id, with_summaries=generate_summaries,
417+
include_paths=include_paths)
407418

408-
functions = self.extract_functions_v2(repo_path)
419+
functions = self.extract_functions_v2(repo_path, include_paths=include_paths)
409420
if not functions:
410421
if progress_callback:
411422
await progress_callback(0, 0, 0)
@@ -691,18 +702,21 @@ async def index_repository_with_progress(
691702
repo_id: str,
692703
repo_path: str,
693704
progress_callback,
694-
max_files: int = None
705+
max_files: int = None,
706+
include_paths: Optional[List[str]] = None,
695707
):
696708
"""Index repository with real-time progress updates
697709
698710
Args:
699711
max_files: If set, limit indexing to first N files (for partial indexing)
712+
include_paths: If set, only index files under these directories
700713
"""
701714
start_time = time.time()
702-
logger.info("Starting optimized indexing with progress", repo_id=repo_id)
715+
logger.info("Starting optimized indexing with progress", repo_id=repo_id,
716+
include_paths=include_paths)
703717

704-
# Discover code files
705-
code_files = self._discover_code_files(repo_path)
718+
# Discover code files (filtered by include_paths if set)
719+
code_files = self._discover_code_files(repo_path, include_paths=include_paths)
706720

707721
# Apply file limit if specified (partial indexing)
708722
if max_files and len(code_files) > max_files:

0 commit comments

Comments
 (0)