From 65d19e24e6b8e36f4ad498bde1b8fb792dd078bb Mon Sep 17 00:00:00 2001
From: Devanshu Rajesh Chicholikar <chicholikar.d@northeastern.edu>
Date: Fri, 6 Mar 2026 18:18:16 -0500
Subject: [PATCH 1/3] fix: persist include_paths to DB and pass to dependency
 analyzer (OPE-162)

Root cause of 1000-node hairball: user indexes packages/sql + packages/vitest
(26 files) but dependency graph scans entire Effect-TS clone (1,767 files).

Fix:
1. Save include_paths to repositories table during indexing
2. All 3 analysis routes now pass repo.get('include_paths') to
   build_dependency_graph
3. Added force=true query param to dependencies endpoint to bypass
   stale cache built without include_paths filtering

After re-indexing, Effect-TS subset repos will show 26 nodes instead of 1000.

Closes OPE-162
---
 backend/routes/analysis.py | 20 +++++++++++---------
 backend/routes/repos.py    |  5 +++++
 2 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/backend/routes/analysis.py b/backend/routes/analysis.py
index 4ce126c..bc59ea0 100644
--- a/backend/routes/analysis.py
+++ b/backend/routes/analysis.py
@@ -21,19 +21,21 @@ class ImpactRequest(BaseModel):
 @router.get("/{repo_id}/dependencies")
 async def get_dependency_graph(
     repo_id: str,
+    force: bool = False,
     auth: AuthContext = Depends(require_auth)
 ):
-    """Get dependency graph for repository."""
+    """Get dependency graph for repository. Use force=true to rebuild from scratch."""
     try:
         repo = get_repo_or_404(repo_id, auth.user_id)
 
-        cached_graph = dependency_analyzer.load_from_cache(repo_id)
-        if cached_graph:
-            logger.debug("Using cached dependency graph", repo_id=repo_id)
-            return {**cached_graph, "cached": True}
+        if not force:
+            cached_graph = dependency_analyzer.load_from_cache(repo_id)
+            if cached_graph:
+                logger.debug("Using cached dependency graph", repo_id=repo_id)
+                return {**cached_graph, "cached": True}
 
-        logger.info("Building fresh dependency graph", repo_id=repo_id)
-        graph_data = dependency_analyzer.build_dependency_graph(repo["local_path"])
+        logger.info("Building fresh dependency graph", repo_id=repo_id, include_paths=repo.get("include_paths"))
+        graph_data = dependency_analyzer.build_dependency_graph(repo["local_path"], include_paths=repo.get("include_paths"))
         dependency_analyzer.save_to_cache(repo_id, graph_data)
 
         return {**graph_data, "cached": False}
@@ -64,7 +66,7 @@ async def analyze_impact(
         graph_data = dependency_analyzer.load_from_cache(repo_id)
         if not graph_data:
             logger.info("Building dependency graph for impact analysis", repo_id=repo_id)
-            graph_data = dependency_analyzer.build_dependency_graph(repo["local_path"])
+            graph_data = dependency_analyzer.build_dependency_graph(repo["local_path"], include_paths=repo.get("include_paths"))
             dependency_analyzer.save_to_cache(repo_id, graph_data)
 
         impact = dependency_analyzer.get_file_impact(
@@ -94,7 +96,7 @@ async def get_repository_insights(
         graph_data = dependency_analyzer.load_from_cache(repo_id)
         if not graph_data:
             logger.info("Building dependency graph for insights", repo_id=repo_id)
-            graph_data = dependency_analyzer.build_dependency_graph(repo["local_path"])
+            graph_data = dependency_analyzer.build_dependency_graph(repo["local_path"], include_paths=repo.get("include_paths"))
             dependency_analyzer.save_to_cache(repo_id, graph_data)
 
         return {
diff --git a/backend/routes/repos.py b/backend/routes/repos.py
index a630aaf..5facbb3 100644
--- a/backend/routes/repos.py
+++ b/backend/routes/repos.py
@@ -602,6 +602,11 @@ async def _run_async_indexing(
         
         repo_manager.update_status(repo_id, "indexing")
         
+        # Persist include_paths so dependency analyzer and other tools use the subset
+        if include_paths:
+            from services.supabase_service import get_supabase_service
+            get_supabase_service().update_repository(repo_id, {"include_paths": include_paths})
+        
         # Publish initial progress to confirm connection
         if publisher:
             publisher.publish_progress(repo_id, 0, 1, 0, "Starting...")

From 04ac019c69ec38c3d8886e29bea83df14cc4a705 Mon Sep 17 00:00:00 2001
From: Devanshu Rajesh Chicholikar <chicholikar.d@northeastern.edu>
Date: Fri, 6 Mar 2026 18:28:22 -0500
Subject: [PATCH 2/3] fix: 3 edge cases -- clear stale include_paths,
 invalidate dep cache, sanitize corrupt DB data

Adversarial review found 3 bugs:

1. CRITICAL: Re-indexing the full repo didn't clear old include_paths.
   The `if include_paths:` guard around the save meant that a re-index
   with include_paths=None skipped the DB update, leaving the previous
   value in place. Now ALWAYS writes include_paths (None clears it).

2. HIGH: Stale dep cache after re-index with different include_paths.
   Dependency cache persisted across re-indexes. Now clear_file_dependencies
   runs at start of every re-index so graph rebuilds with new include_paths.

3. MEDIUM: Corrupt jsonb from DB (e.g. [123, null]) crashed Path().parts.
   build_dependency_graph now filters non-string entries and falls back to
   full scan if all entries are invalid.

3 new tests:
- test_include_paths_with_corrupt_data: mixed valid/invalid entries
- test_include_paths_all_corrupt_scans_everything: all invalid -> full scan
- test_include_paths_empty_list_scans_everything: [] same as None
---
 backend/routes/repos.py                   | 11 +++++----
 backend/services/dependency_analyzer.py   |  6 +++++
 backend/tests/test_dependency_analyzer.py | 28 +++++++++++++++++++++++
 3 files changed, 41 insertions(+), 4 deletions(-)

diff --git a/backend/routes/repos.py b/backend/routes/repos.py
index 5facbb3..46421ae 100644
--- a/backend/routes/repos.py
+++ b/backend/routes/repos.py
@@ -602,10 +602,13 @@ async def _run_async_indexing(
         
         repo_manager.update_status(repo_id, "indexing")
         
-        # Persist include_paths so dependency analyzer and other tools use the subset
-        if include_paths:
-            from services.supabase_service import get_supabase_service
-            get_supabase_service().update_repository(repo_id, {"include_paths": include_paths})
+        # Persist include_paths (or clear it if re-indexing full repo)
+        from services.supabase_service import get_supabase_service
+        db = get_supabase_service()
+        db.update_repository(repo_id, {"include_paths": include_paths})
+
+        # Clear stale dependency cache so next graph build uses new include_paths
+        db.clear_file_dependencies(repo_id)
         
         # Publish initial progress to confirm connection
         if publisher:
diff --git a/backend/services/dependency_analyzer.py b/backend/services/dependency_analyzer.py
index 927efc6..7966235 100644
--- a/backend/services/dependency_analyzer.py
+++ b/backend/services/dependency_analyzer.py
@@ -142,6 +142,12 @@ def build_dependency_graph(self, repo_path: str, include_paths: List[str] = None
         """Build dependency graph. If include_paths set, only analyze those dirs."""
         repo_path = Path(repo_path)
         
+        # Sanitize include_paths from DB (could be corrupt jsonb)
+        if include_paths:
+            include_paths = [p for p in include_paths if isinstance(p, str) and p.strip()]
+            if not include_paths:
+                include_paths = None
+        
         # Discover code files
         code_files = []
         extensions = {'.py', '.js', '.jsx', '.ts', '.tsx'}
diff --git a/backend/tests/test_dependency_analyzer.py b/backend/tests/test_dependency_analyzer.py
index 317dfe1..a36f8fb 100644
--- a/backend/tests/test_dependency_analyzer.py
+++ b/backend/tests/test_dependency_analyzer.py
@@ -280,6 +280,34 @@ def test_include_paths_multiple_dirs(self, analyzer, ts_repo):
         assert any('packages/schema' in f for f in file_paths)
         assert not any('backend' in f for f in file_paths)
 
+    def test_include_paths_with_corrupt_data(self, analyzer, ts_repo):
+        """Corrupt jsonb from DB should not crash -- non-strings are filtered out"""
+        graph = analyzer.build_dependency_graph(
+            str(ts_repo),
+            include_paths=[123, None, '', 'packages/effect', True]
+        )
+        file_paths = set(graph['dependencies'].keys())
+        # Should only include effect files, corrupt entries filtered
+        assert all('packages/effect' in f for f in file_paths)
+        assert len(file_paths) > 0
+
+    def test_include_paths_all_corrupt_scans_everything(self, analyzer, ts_repo):
+        """If all include_paths entries are invalid, fall back to full scan"""
+        graph = analyzer.build_dependency_graph(
+            str(ts_repo),
+            include_paths=[123, None, '', False]
+        )
+        file_paths = set(graph['dependencies'].keys())
+        # Should fall back to scanning everything
+        assert any('backend' in f for f in file_paths)
+        assert any('packages/effect' in f for f in file_paths)
+
+    def test_include_paths_empty_list_scans_everything(self, analyzer, ts_repo):
+        """Empty list should be treated same as None"""
+        graph = analyzer.build_dependency_graph(str(ts_repo), include_paths=[])
+        file_paths = set(graph['dependencies'].keys())
+        assert any('backend' in f for f in file_paths)
+
 
 class TestGraphMetrics:
     """Verify graph statistics are correct"""

From 9c9f20ff2e2b73f85c87df8abcd0f898021d9bc8 Mon Sep 17 00:00:00 2001
From: Devanshu Rajesh Chicholikar <chicholikar.d@northeastern.edu>
Date: Fri, 6 Mar 2026 18:41:19 -0500
Subject: [PATCH 3/3] fix: normalize include_paths -- convert backslashes,
 reject traversal, defense in depth

Sanitizer now matches the validation in IndexConfig.sanitize_paths:
- Backslashes replaced with forward slashes
- Leading/trailing whitespace and slashes stripped
- Path traversal (..) entries rejected
- Non-string and empty entries filtered

2 new tests: traversal rejection, backslash normalization. 35 total pass.

Review findings intentionally not addressed:
- Cache keyed by include_paths: not needed, cache cleared on re-index
- repos.py truthy-only write: already fixed in previous commit
- Test type hints: entire suite has zero, adding to 3 tests is inconsistent
---
 backend/services/dependency_analyzer.py   | 12 +++++++++---
 backend/tests/test_dependency_analyzer.py | 21 +++++++++++++++++++++
 2 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/backend/services/dependency_analyzer.py b/backend/services/dependency_analyzer.py
index 7966235..d28062d 100644
--- a/backend/services/dependency_analyzer.py
+++ b/backend/services/dependency_analyzer.py
@@ -144,9 +144,15 @@ def build_dependency_graph(self, repo_path: str, include_paths: List[str] = None
         
         # Sanitize include_paths from DB (could be corrupt jsonb)
         if include_paths:
-            include_paths = [p for p in include_paths if isinstance(p, str) and p.strip()]
-            if not include_paths:
-                include_paths = None
+            cleaned = []
+            for p in include_paths:
+                if not isinstance(p, str):
+                    continue
+                p = p.replace('\\', '/').strip().strip('/')
+                if not p or '..' in p.split('/'):
+                    continue
+                cleaned.append(p)
+            include_paths = cleaned or None
         
         # Discover code files
         code_files = []
diff --git a/backend/tests/test_dependency_analyzer.py b/backend/tests/test_dependency_analyzer.py
index a36f8fb..dcf7518 100644
--- a/backend/tests/test_dependency_analyzer.py
+++ b/backend/tests/test_dependency_analyzer.py
@@ -308,6 +308,27 @@ def test_include_paths_empty_list_scans_everything(self, analyzer, ts_repo):
         file_paths = set(graph['dependencies'].keys())
         assert any('backend' in f for f in file_paths)
 
+    def test_include_paths_traversal_rejected(self, analyzer, ts_repo):
+        """Path traversal attempts should be stripped, not crash"""
+        graph = analyzer.build_dependency_graph(
+            str(ts_repo),
+            include_paths=['../etc/passwd', 'packages/effect', '../../secrets']
+        )
+        file_paths = set(graph['dependencies'].keys())
+        # Traversal entries filtered, only packages/effect remains
+        assert all('packages/effect' in f for f in file_paths)
+        assert len(file_paths) > 0
+
+    def test_include_paths_backslash_normalized(self, analyzer, ts_repo):
+        """Windows-style backslashes should be normalized"""
+        graph = analyzer.build_dependency_graph(
+            str(ts_repo),
+            include_paths=['packages\\effect']
+        )
+        file_paths = set(graph['dependencies'].keys())
+        assert all('packages/effect' in f for f in file_paths)
+        assert len(file_paths) > 0
+
 
 class TestGraphMetrics:
     """Verify graph statistics are correct"""