Skip to content

Commit 022c297

Browse files
committed
fix: address PR review -- 4 verified findings
1. Add `-> int` return annotation to index_repository_with_progress.
2. Skip incremental indexing when include_paths is set (incremental_index_repository uses git diff, which doesn't understand subset boundaries).
3. Move blocking rglob() out of the async handler into a _scan_directories() sync helper, called via asyncio.to_thread() to avoid blocking the event loop on large repos.
4. Replace the full-repo rglob + startswith filter in _discover_code_files with targeted subtree walking (only iterate include_paths dirs) -- eliminates prefix collision risk (packages/effectx no longer matches packages/effect).
1 parent 781fdd2 commit 022c297

2 files changed

Lines changed: 55 additions & 33 deletions

File tree

backend/routes/repos.py

Lines changed: 29 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -178,28 +178,17 @@ async def delete_repository(
178178
raise HTTPException(status_code=500, detail="Failed to delete repository")
179179

180180

181-
@router.get("/{repo_id}/directories")
182-
async def get_repo_directories(
183-
repo_id: str,
184-
auth: AuthContext = Depends(require_auth),
185-
):
186-
"""Return the top-level directory tree of a cloned repo.
181+
def _scan_directories(local_path: Path) -> list:
182+
"""Scan top-level directories and count code files in each.
187183
188-
Used for monorepo subset selection -- lets the user pick which
189-
directories to index instead of the entire repo.
184+
Runs synchronously -- call via asyncio.to_thread() from async handlers
185+
to avoid blocking the event loop on large repos.
190186
"""
191-
repo = get_repo_or_404(repo_id, auth.user_id)
192-
local_path = Path(repo["local_path"])
193-
194-
if not local_path.exists():
195-
raise HTTPException(status_code=404, detail="Repo not cloned yet")
196-
197187
skip = {"node_modules", ".git", "__pycache__", "venv", ".next", "dist", "build"}
188+
extensions = {".py", ".js", ".jsx", ".ts", ".tsx"}
198189
dirs = []
199190
for item in sorted(local_path.iterdir()):
200191
if item.is_dir() and item.name not in skip and not item.name.startswith("."):
201-
# count code files in this directory
202-
extensions = {".py", ".js", ".jsx", ".ts", ".tsx"}
203192
file_count = sum(
204193
1 for f in item.rglob("*")
205194
if f.is_file() and f.suffix in extensions
@@ -210,6 +199,26 @@ async def get_repo_directories(
210199
"path": str(item.relative_to(local_path)),
211200
"file_count": file_count,
212201
})
202+
return dirs
203+
204+
205+
@router.get("/{repo_id}/directories")
206+
async def get_repo_directories(
207+
repo_id: str,
208+
auth: AuthContext = Depends(require_auth),
209+
):
210+
"""Return the top-level directory tree of a cloned repo.
211+
212+
Used for monorepo subset selection -- lets the user pick which
213+
directories to index instead of the entire repo.
214+
"""
215+
repo = get_repo_or_404(repo_id, auth.user_id)
216+
local_path = Path(repo["local_path"])
217+
218+
if not local_path.exists():
219+
raise HTTPException(status_code=404, detail="Repo not cloned yet")
220+
221+
dirs = await asyncio.to_thread(_scan_directories, local_path)
213222

214223
return {
215224
"repo_id": repo_id,
@@ -341,9 +350,12 @@ async def _run_async_indexing(
341350
publisher.publish_progress(repo_id, 0, 1, 0, "Starting...")
342351

343352
# Check for incremental
353+
# Skip incremental when include_paths is set -- incremental_index_repository
354+
# uses git diff which doesn't understand subset boundaries
344355
last_commit = repo_manager.get_last_indexed_commit(repo_id)
356+
can_incremental = incremental and last_commit and not include_paths
345357

346-
if incremental and last_commit:
358+
if can_incremental:
347359
logger.info("Async INCREMENTAL indexing", repo_id=repo_id, last_commit=last_commit[:8])
348360
total_functions = await indexer.incremental_index_repository(
349361
repo_id,

backend/services/indexer_optimized.py

Lines changed: 26 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -125,26 +125,36 @@ def _discover_code_files(
125125
Args:
126126
include_paths: If set, only include files under these relative
127127
directories (e.g. ['packages/effect', 'packages/schema']).
128+
Uses path-component-aware matching and only walks the
129+
specified subtrees instead of the entire repo.
128130
"""
129131
repo_path = Path(repo_path)
130132
code_files = []
131-
133+
132134
extensions = {'.py', '.js', '.jsx', '.ts', '.tsx'}
133135
skip_dirs = {'node_modules', '.git', '__pycache__', 'venv', 'env', 'dist', 'build', '.next', '.vscode'}
134-
135-
for file_path in repo_path.rglob('*'):
136-
if file_path.is_dir():
137-
continue
138-
if any(skip in file_path.parts for skip in skip_dirs):
139-
continue
140-
if file_path.suffix in extensions:
141-
# Subset filter: only include files under specified paths
142-
if include_paths:
143-
rel = str(file_path.relative_to(repo_path))
144-
if not any(rel.startswith(p) for p in include_paths):
145-
continue
146-
code_files.append(file_path)
147-
136+
137+
# When include_paths is set, only walk those subtrees
138+
if include_paths:
139+
roots = []
140+
for p in include_paths:
141+
subtree = repo_path / p
142+
if subtree.is_dir():
143+
roots.append(subtree)
144+
else:
145+
logger.warning("include_path not found, skipping: %s", p)
146+
else:
147+
roots = [repo_path]
148+
149+
for root in roots:
150+
for file_path in root.rglob('*'):
151+
if file_path.is_dir():
152+
continue
153+
if any(skip in file_path.parts for skip in skip_dirs):
154+
continue
155+
if file_path.suffix in extensions:
156+
code_files.append(file_path)
157+
148158
return code_files
149159

150160
async def _create_embeddings_batch(self, texts: List[str]) -> List[List[float]]:
@@ -704,7 +714,7 @@ async def index_repository_with_progress(
704714
progress_callback,
705715
max_files: int = None,
706716
include_paths: Optional[List[str]] = None,
707-
):
717+
) -> int:
708718
"""Index repository with real-time progress updates
709719
710720
Args:

0 commit comments

Comments
 (0)