From 65d19e24e6b8e36f4ad498bde1b8fb792dd078bb Mon Sep 17 00:00:00 2001 From: Devanshu Rajesh Chicholikar Date: Fri, 6 Mar 2026 18:18:16 -0500 Subject: [PATCH 1/3] fix: persist include_paths to DB and pass to dependency analyzer (OPE-162) Root cause of 1000-node hairball: user indexes packages/sql + packages/vitest (26 files) but dependency graph scans entire Effect-TS clone (1,767 files). Fix: 1. Save include_paths to repositories table during indexing 2. All 3 analysis routes now pass repo.get('include_paths') to build_dependency_graph 3. Added force=true query param to dependencies endpoint to bypass stale cache built without include_paths filtering After re-indexing, Effect-TS subset repos will show 26 nodes instead of 1000. Closes OPE-162 --- backend/routes/analysis.py | 20 +++++++++++--------- backend/routes/repos.py | 5 +++++ 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/backend/routes/analysis.py b/backend/routes/analysis.py index 4ce126c..bc59ea0 100644 --- a/backend/routes/analysis.py +++ b/backend/routes/analysis.py @@ -21,19 +21,21 @@ class ImpactRequest(BaseModel): @router.get("/{repo_id}/dependencies") async def get_dependency_graph( repo_id: str, + force: bool = False, auth: AuthContext = Depends(require_auth) ): - """Get dependency graph for repository.""" + """Get dependency graph for repository. 
Use force=true to rebuild from scratch.""" try: repo = get_repo_or_404(repo_id, auth.user_id) - cached_graph = dependency_analyzer.load_from_cache(repo_id) - if cached_graph: - logger.debug("Using cached dependency graph", repo_id=repo_id) - return {**cached_graph, "cached": True} + if not force: + cached_graph = dependency_analyzer.load_from_cache(repo_id) + if cached_graph: + logger.debug("Using cached dependency graph", repo_id=repo_id) + return {**cached_graph, "cached": True} - logger.info("Building fresh dependency graph", repo_id=repo_id) - graph_data = dependency_analyzer.build_dependency_graph(repo["local_path"]) + logger.info("Building fresh dependency graph", repo_id=repo_id, include_paths=repo.get("include_paths")) + graph_data = dependency_analyzer.build_dependency_graph(repo["local_path"], include_paths=repo.get("include_paths")) dependency_analyzer.save_to_cache(repo_id, graph_data) return {**graph_data, "cached": False} @@ -64,7 +66,7 @@ async def analyze_impact( graph_data = dependency_analyzer.load_from_cache(repo_id) if not graph_data: logger.info("Building dependency graph for impact analysis", repo_id=repo_id) - graph_data = dependency_analyzer.build_dependency_graph(repo["local_path"]) + graph_data = dependency_analyzer.build_dependency_graph(repo["local_path"], include_paths=repo.get("include_paths")) dependency_analyzer.save_to_cache(repo_id, graph_data) impact = dependency_analyzer.get_file_impact( @@ -94,7 +96,7 @@ async def get_repository_insights( graph_data = dependency_analyzer.load_from_cache(repo_id) if not graph_data: logger.info("Building dependency graph for insights", repo_id=repo_id) - graph_data = dependency_analyzer.build_dependency_graph(repo["local_path"]) + graph_data = dependency_analyzer.build_dependency_graph(repo["local_path"], include_paths=repo.get("include_paths")) dependency_analyzer.save_to_cache(repo_id, graph_data) return { diff --git a/backend/routes/repos.py b/backend/routes/repos.py index a630aaf..5facbb3 
100644 --- a/backend/routes/repos.py +++ b/backend/routes/repos.py @@ -602,6 +602,11 @@ async def _run_async_indexing( repo_manager.update_status(repo_id, "indexing") + # Persist include_paths so dependency analyzer and other tools use the subset + if include_paths: + from services.supabase_service import get_supabase_service + get_supabase_service().update_repository(repo_id, {"include_paths": include_paths}) + # Publish initial progress to confirm connection if publisher: publisher.publish_progress(repo_id, 0, 1, 0, "Starting...") From 04ac019c69ec38c3d8886e29bea83df14cc4a705 Mon Sep 17 00:00:00 2001 From: Devanshu Rajesh Chicholikar Date: Fri, 6 Mar 2026 18:28:22 -0500 Subject: [PATCH 2/3] fix: 3 edge cases -- clear stale include_paths, invalidate dep cache, sanitize corrupt DB data Adversarial review found 3 bugs: 1. CRITICAL: Re-indexing full repo didn't clear old include_paths. The DB write was guarded by `if include_paths:`, so a re-index with None skipped the update and the old value survived. Now ALWAYS writes include_paths (None clears it). 2. HIGH: Stale dep cache after re-index with different include_paths. Dependency cache persisted across re-indexes. Now clear_file_dependencies runs at start of every re-index so graph rebuilds with new include_paths. 3. MEDIUM: Corrupt jsonb from DB (e.g. [123, null]) crashed Path().parts. build_dependency_graph now filters non-string entries and falls back to full scan if all entries are invalid. 
3 new tests: - test_include_paths_with_corrupt_data: mixed valid/invalid entries - test_include_paths_all_corrupt_scans_everything: all invalid -> full scan - test_include_paths_empty_list_scans_everything: [] same as None --- backend/routes/repos.py | 11 +++++---- backend/services/dependency_analyzer.py | 6 +++++ backend/tests/test_dependency_analyzer.py | 28 +++++++++++++++++++++++ 3 files changed, 41 insertions(+), 4 deletions(-) diff --git a/backend/routes/repos.py b/backend/routes/repos.py index 5facbb3..46421ae 100644 --- a/backend/routes/repos.py +++ b/backend/routes/repos.py @@ -602,10 +602,13 @@ async def _run_async_indexing( repo_manager.update_status(repo_id, "indexing") - # Persist include_paths so dependency analyzer and other tools use the subset - if include_paths: - from services.supabase_service import get_supabase_service - get_supabase_service().update_repository(repo_id, {"include_paths": include_paths}) + # Persist include_paths (or clear it if re-indexing full repo) + from services.supabase_service import get_supabase_service + db = get_supabase_service() + db.update_repository(repo_id, {"include_paths": include_paths}) + + # Clear stale dependency cache so next graph build uses new include_paths + db.clear_file_dependencies(repo_id) # Publish initial progress to confirm connection if publisher: diff --git a/backend/services/dependency_analyzer.py b/backend/services/dependency_analyzer.py index 927efc6..7966235 100644 --- a/backend/services/dependency_analyzer.py +++ b/backend/services/dependency_analyzer.py @@ -142,6 +142,12 @@ def build_dependency_graph(self, repo_path: str, include_paths: List[str] = None """Build dependency graph. 
If include_paths set, only analyze those dirs.""" repo_path = Path(repo_path) + # Sanitize include_paths from DB (could be corrupt jsonb) + if include_paths: + include_paths = [p for p in include_paths if isinstance(p, str) and p.strip()] + if not include_paths: + include_paths = None + # Discover code files code_files = [] extensions = {'.py', '.js', '.jsx', '.ts', '.tsx'} diff --git a/backend/tests/test_dependency_analyzer.py b/backend/tests/test_dependency_analyzer.py index 317dfe1..a36f8fb 100644 --- a/backend/tests/test_dependency_analyzer.py +++ b/backend/tests/test_dependency_analyzer.py @@ -280,6 +280,34 @@ def test_include_paths_multiple_dirs(self, analyzer, ts_repo): assert any('packages/schema' in f for f in file_paths) assert not any('backend' in f for f in file_paths) + def test_include_paths_with_corrupt_data(self, analyzer, ts_repo): + """Corrupt jsonb from DB should not crash -- non-strings are filtered out""" + graph = analyzer.build_dependency_graph( + str(ts_repo), + include_paths=[123, None, '', 'packages/effect', True] + ) + file_paths = set(graph['dependencies'].keys()) + # Should only include effect files, corrupt entries filtered + assert all('packages/effect' in f for f in file_paths) + assert len(file_paths) > 0 + + def test_include_paths_all_corrupt_scans_everything(self, analyzer, ts_repo): + """If all include_paths entries are invalid, fall back to full scan""" + graph = analyzer.build_dependency_graph( + str(ts_repo), + include_paths=[123, None, '', False] + ) + file_paths = set(graph['dependencies'].keys()) + # Should fall back to scanning everything + assert any('backend' in f for f in file_paths) + assert any('packages/effect' in f for f in file_paths) + + def test_include_paths_empty_list_scans_everything(self, analyzer, ts_repo): + """Empty list should be treated same as None""" + graph = analyzer.build_dependency_graph(str(ts_repo), include_paths=[]) + file_paths = set(graph['dependencies'].keys()) + assert any('backend' in f for 
f in file_paths) + class TestGraphMetrics: """Verify graph statistics are correct""" From 9c9f20ff2e2b73f85c87df8abcd0f898021d9bc8 Mon Sep 17 00:00:00 2001 From: Devanshu Rajesh Chicholikar Date: Fri, 6 Mar 2026 18:41:19 -0500 Subject: [PATCH 3/3] fix: normalize include_paths -- normalize backslashes, reject traversal, defense in depth Sanitizer now matches the validation in IndexConfig.sanitize_paths: - Backslashes replaced with forward slashes - Leading/trailing whitespace and slashes stripped - Path traversal (..) entries rejected - Non-string and empty entries filtered 2 new tests: traversal rejection, backslash normalization. 35 total pass. Skipped findings: - Cache keyed by include_paths: not needed, cache cleared on re-index - repos.py truthy-only write: already fixed in previous commit - Test type hints: entire suite has zero, adding to 3 tests is inconsistent --- backend/services/dependency_analyzer.py | 12 +++++++++--- backend/tests/test_dependency_analyzer.py | 21 +++++++++++++++++++++ 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/backend/services/dependency_analyzer.py b/backend/services/dependency_analyzer.py index 7966235..d28062d 100644 --- a/backend/services/dependency_analyzer.py +++ b/backend/services/dependency_analyzer.py @@ -144,9 +144,15 @@ def build_dependency_graph(self, repo_path: str, include_paths: List[str] = None # Sanitize include_paths from DB (could be corrupt jsonb) if include_paths: - include_paths = [p for p in include_paths if isinstance(p, str) and p.strip()] - if not include_paths: - include_paths = None + cleaned = [] + for p in include_paths: + if not isinstance(p, str): + continue + p = p.replace('\\', '/').strip().strip('/') + if not p or '..' 
in p.split('/'): + continue + cleaned.append(p) + include_paths = cleaned or None # Discover code files code_files = [] diff --git a/backend/tests/test_dependency_analyzer.py b/backend/tests/test_dependency_analyzer.py index a36f8fb..dcf7518 100644 --- a/backend/tests/test_dependency_analyzer.py +++ b/backend/tests/test_dependency_analyzer.py @@ -308,6 +308,27 @@ def test_include_paths_empty_list_scans_everything(self, analyzer, ts_repo): file_paths = set(graph['dependencies'].keys()) assert any('backend' in f for f in file_paths) + def test_include_paths_traversal_rejected(self, analyzer, ts_repo): + """Path traversal attempts should be stripped, not crash""" + graph = analyzer.build_dependency_graph( + str(ts_repo), + include_paths=['../etc/passwd', 'packages/effect', '../../secrets'] + ) + file_paths = set(graph['dependencies'].keys()) + # Traversal entries filtered, only packages/effect remains + assert all('packages/effect' in f for f in file_paths) + assert len(file_paths) > 0 + + def test_include_paths_backslash_normalized(self, analyzer, ts_repo): + """Windows-style backslashes should be normalized""" + graph = analyzer.build_dependency_graph( + str(ts_repo), + include_paths=['packages\\effect'] + ) + file_paths = set(graph['dependencies'].keys()) + assert all('packages/effect' in f for f in file_paths) + assert len(file_paths) > 0 + class TestGraphMetrics: """Verify graph statistics are correct"""