refactor: consolidate test file detection into shared utility

DevanshuNEU · DevanshuNEU · commit 72605e6decef · 2026-01-26T16:43:07.000-05:00
- Create utils/test_detection.py as single source of truth
- Reuse in indexer_optimized.py, final_v3_test.py, code_graph_ranker.py
- Remove duplicate is_test_file implementations
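- All call sites now use the shared regex patterns for consistency (the substring checks in final_v3_test.py and indexer_optimized.py are replaced, so matching differs slightly from before)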
diff --git a/backend/scripts/final_v3_test.py b/backend/scripts/final_v3_test.py
@@ -22,6 +22,7 @@
     sys.exit(1)
 
 from services.indexer_optimized import OptimizedCodeIndexer
+from utils.test_detection import has_test_file_in_top_n as has_test_file
 
 # All query types combined
 ALL_QUERIES = [
@@ -43,25 +44,6 @@
 ]
 
 
-def has_test_file(results):
-    """Check if any of top 3 results are test files (stricter pattern matching)"""
-    import os
-    for r in results[:3]:
-        fp = r.get("file_path", "").lower()
-        # check for test directories
-        if "/test/" in fp or "/tests/" in fp:
-            return True
-        # check basename patterns
-        basename = os.path.basename(fp)
-        if basename.startswith("test_") or basename.startswith("test."):
-            return True
-        if "_test." in basename or basename.endswith("_test.py"):
-            return True
-        if ".spec." in basename:
-            return True
-    return False
-
-
 async def run_final_test():
     print()
     print("╔" + "═" * 68 + "╗")
diff --git a/backend/services/indexer_optimized.py b/backend/services/indexer_optimized.py
@@ -14,6 +14,8 @@
 import asyncio
 from collections import defaultdict
 
+from utils.test_detection import is_test_file, filter_test_files
+
 # Tree-sitter for parsing
 import tree_sitter_python as tspython
 import tree_sitter_javascript as tsjavascript
@@ -636,25 +638,8 @@ async def search_v3(
             results = await self.search_v2(query, repo_id, top_k, use_reranking)
             # apply test filtering to V2 results (V2 doesn't filter tests by default)
             if not include_tests:
-                results = [r for r in results if not self._is_test_file(r.get("file_path", ""))]
+                results = filter_test_files(results)
             return results
-    
-    def _is_test_file(self, file_path: str) -> bool:
-        """Check if file is a test file (stricter pattern matching)"""
-        import os
-        fp = file_path.lower()
-        # test directories
-        if "/test/" in fp or "/tests/" in fp:
-            return True
-        # basename patterns
-        basename = os.path.basename(fp)
-        if basename.startswith("test_") or basename.startswith("test."):
-            return True
-        if "_test." in basename or basename.endswith("_test.py"):
-            return True
-        if ".spec." in basename:
-            return True
-        return False
 
     async def explain_code(
         self,
diff --git a/backend/services/search_v3/code_graph_ranker.py b/backend/services/search_v3/code_graph_ranker.py
@@ -7,6 +7,7 @@
 from dataclasses import dataclass
 
 from services.observability import logger
+from utils.test_detection import is_test_file as shared_is_test_file
 
 
 @dataclass
@@ -30,18 +31,6 @@ class CodeGraphRanker:
     4. Core file boost (main, index, app files)
     """
     
-    # patterns for test files
-    TEST_PATTERNS = [
-        r'test[s]?[/_]',          # test/, tests/, test_
-        r'[/_]test[s]?\.py$',     # _test.py, _tests.py
-        r'\.test\.[jt]sx?$',      # .test.js, .test.ts
-        r'\.spec\.[jt]sx?$',      # .spec.js, .spec.ts  
-        r'__tests__',             # __tests__/
-        r'conftest\.py$',         # pytest config
-        r'fixtures?[/_]',         # fixtures/
-        r'mock[s]?[/_]',          # mocks/
-    ]
-    
     # patterns for core files (boost these)
     CORE_PATTERNS = [
         r'main\.[a-z]+$',
@@ -135,12 +124,8 @@ def calculate_importance(
         return importance_map
     
     def _is_test_file(self, file_path: str) -> bool:
-        """Check if file is a test file"""
-        file_path_lower = file_path.lower()
-        for pattern in self.TEST_PATTERNS:
-            if re.search(pattern, file_path_lower):
-                return True
-        return False
+        """Check if file is a test file (uses shared utility)"""
+        return shared_is_test_file(file_path)
     
     def _is_core_file(self, file_path: str) -> bool:
         """Check if file is a core/important file"""
diff --git a/backend/utils/__init__.py b/backend/utils/__init__.py
@@ -0,0 +1 @@
+# Utils module
diff --git a/backend/utils/test_detection.py b/backend/utils/test_detection.py
@@ -0,0 +1,72 @@
+"""
+Shared test file detection utilities.
+Single source of truth for test file patterns across V2/V3 search.
+"""
+import re
+from typing import List
+
+
+# Regex patterns for test files (consolidated from CodeGraphRanker); is_test_file() runs re.search on the lowercased path
+TEST_PATTERNS = [
+    r'test[s]?[/_]',          # test/, tests/, test_, tests_ -- NOTE(review): unanchored, so it also hits e.g. latest/
+    r'[/_]test[s]?\.py$',     # _test.py, _tests.py, /test.py, /tests.py
+    r'\.test\.[jt]sx?$',      # .test.js, .test.jsx, .test.ts, .test.tsx
+    r'\.spec\.[jt]sx?$',      # .spec.js, .spec.jsx, .spec.ts, .spec.tsx
+    r'__tests__',             # __tests__ anywhere in the path
+    r'conftest\.py$',         # any path ending in conftest.py (pytest config)
+    r'fixtures?[/_]',         # fixture/, fixtures/, fixture_, fixtures_
+    r'mock[s]?[/_]',          # mock/, mocks/, mock_, mocks_
+]
+
+
+def is_test_file(file_path: str) -> bool:
+    """
+    Report whether a path looks like a test file.
+
+    The path is lowercased and searched against every regex in
+    TEST_PATTERNS; a single hit anywhere in the path is enough.
+
+    Args:
+        file_path: Relative or absolute path; a falsy value yields False
+
+    Returns:
+        True when at least one test pattern matches, otherwise False
+    """
+    if not file_path:
+        return False
+    lowered = file_path.lower()
+    return any(re.search(regex, lowered) for regex in TEST_PATTERNS)
+
+
+def filter_test_files(results: List[dict], include_tests: bool = False) -> List[dict]:
+    """
+    Drop search results whose 'file_path' is classified as a test file.
+
+    Args:
+        results: Search result dicts; a missing 'file_path' counts as ""
+        include_tests: When True, `results` is returned untouched
+
+    Returns:
+        The input list itself if include_tests is True, else a new list
+    """
+    if not include_tests:
+        return [hit for hit in results if not is_test_file(hit.get("file_path", ""))]
+    return results
+
+
+def has_test_file_in_top_n(results: List[dict], n: int = 3) -> bool:
+    """
+    Tell whether a test file appears among the first N results.
+    Handy for measuring test pollution in benchmark runs.
+
+    Args:
+        results: Search result dicts; a missing 'file_path' counts as ""
+        n: How many leading results to inspect (the slice results[:n])
+
+    Returns:
+        True if any inspected result is a test file, otherwise False
+    """
+    # Lazily pull the paths of the leading slice so that any() can stop
+    # at the first path is_test_file accepts.
+    top_paths = (hit.get("file_path", "") for hit in results[:n])
+    return any(is_test_file(path) for path in top_paths)