Skip to content

Commit 72605e6

Browse files
committed
refactor: consolidate test file detection into shared utility
- Create utils/test_detection.py as the single source of truth
- Reuse it in indexer_optimized.py, final_v3_test.py, and code_graph_ranker.py
- Remove duplicate is_test_file implementations
- All patterns now use regex for consistency
1 parent 757bd07 commit 72605e6

5 files changed

Lines changed: 80 additions & 55 deletions

File tree

backend/scripts/final_v3_test.py

Lines changed: 1 addition & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
sys.exit(1)
2323

2424
from services.indexer_optimized import OptimizedCodeIndexer
25+
from utils.test_detection import has_test_file_in_top_n as has_test_file
2526

2627
# All query types combined
2728
ALL_QUERIES = [
@@ -43,25 +44,6 @@
4344
]
4445

4546

46-
def has_test_file(results):
47-
"""Check if any of top 3 results are test files (stricter pattern matching)"""
48-
import os
49-
for r in results[:3]:
50-
fp = r.get("file_path", "").lower()
51-
# check for test directories
52-
if "/test/" in fp or "/tests/" in fp:
53-
return True
54-
# check basename patterns
55-
basename = os.path.basename(fp)
56-
if basename.startswith("test_") or basename.startswith("test."):
57-
return True
58-
if "_test." in basename or basename.endswith("_test.py"):
59-
return True
60-
if ".spec." in basename:
61-
return True
62-
return False
63-
64-
6547
async def run_final_test():
6648
print()
6749
print("╔" + "═" * 68 + "╗")

backend/services/indexer_optimized.py

Lines changed: 3 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414
import asyncio
1515
from collections import defaultdict
1616

17+
from utils.test_detection import is_test_file, filter_test_files
18+
1719
# Tree-sitter for parsing
1820
import tree_sitter_python as tspython
1921
import tree_sitter_javascript as tsjavascript
@@ -636,25 +638,8 @@ async def search_v3(
636638
results = await self.search_v2(query, repo_id, top_k, use_reranking)
637639
# apply test filtering to V2 results (V2 doesn't filter tests by default)
638640
if not include_tests:
639-
results = [r for r in results if not self._is_test_file(r.get("file_path", ""))]
641+
results = filter_test_files(results)
640642
return results
641-
642-
def _is_test_file(self, file_path: str) -> bool:
643-
"""Check if file is a test file (stricter pattern matching)"""
644-
import os
645-
fp = file_path.lower()
646-
# test directories
647-
if "/test/" in fp or "/tests/" in fp:
648-
return True
649-
# basename patterns
650-
basename = os.path.basename(fp)
651-
if basename.startswith("test_") or basename.startswith("test."):
652-
return True
653-
if "_test." in basename or basename.endswith("_test.py"):
654-
return True
655-
if ".spec." in basename:
656-
return True
657-
return False
658643

659644
async def explain_code(
660645
self,

backend/services/search_v3/code_graph_ranker.py

Lines changed: 3 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from dataclasses import dataclass
88

99
from services.observability import logger
10+
from utils.test_detection import is_test_file as shared_is_test_file
1011

1112

1213
@dataclass
@@ -30,18 +31,6 @@ class CodeGraphRanker:
3031
4. Core file boost (main, index, app files)
3132
"""
3233

33-
# patterns for test files
34-
TEST_PATTERNS = [
35-
r'test[s]?[/_]', # test/, tests/, test_
36-
r'[/_]test[s]?\.py$', # _test.py, _tests.py
37-
r'\.test\.[jt]sx?$', # .test.js, .test.ts
38-
r'\.spec\.[jt]sx?$', # .spec.js, .spec.ts
39-
r'__tests__', # __tests__/
40-
r'conftest\.py$', # pytest config
41-
r'fixtures?[/_]', # fixtures/
42-
r'mock[s]?[/_]', # mocks/
43-
]
44-
4534
# patterns for core files (boost these)
4635
CORE_PATTERNS = [
4736
r'main\.[a-z]+$',
@@ -135,12 +124,8 @@ def calculate_importance(
135124
return importance_map
136125

137126
def _is_test_file(self, file_path: str) -> bool:
138-
"""Check if file is a test file"""
139-
file_path_lower = file_path.lower()
140-
for pattern in self.TEST_PATTERNS:
141-
if re.search(pattern, file_path_lower):
142-
return True
143-
return False
127+
"""Check if file is a test file (uses shared utility)"""
128+
return shared_is_test_file(file_path)
144129

145130
def _is_core_file(self, file_path: str) -> bool:
146131
"""Check if file is a core/important file"""

backend/utils/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# Utils module

backend/utils/test_detection.py

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
"""
2+
Shared test file detection utilities.
3+
Single source of truth for test file patterns across V2/V3 search.
4+
"""
5+
import re
6+
from typing import List
7+
8+
9+
# Regex patterns for test files (consolidated from CodeGraphRanker).
#
# Every pattern is applied with re.search to the lower-cased path. The bare
# keyword patterns carry a (?<![a-z0-9]) lookbehind so the keyword has to
# start a name component: "tests/", "test_x.py" and "unit_test_x.py" match,
# while "latest/", "contest_x.py" and "hammock/" do not. Without the
# lookbehind those ordinary source paths were misclassified as tests and
# dropped from search results.
TEST_PATTERNS = [
    r'(?<![a-z0-9])tests?[/_]',     # test/, tests/, test_
    r'[/_]tests?\.py$',             # _test.py, _tests.py
    r'\.test\.[jt]sx?$',            # .test.js, .test.ts
    r'\.spec\.[jt]sx?$',            # .spec.js, .spec.ts
    r'__tests__',                   # __tests__/
    r'conftest\.py$',               # pytest config
    r'(?<![a-z0-9])fixtures?[/_]',  # fixtures/
    r'(?<![a-z0-9])mocks?[/_]',     # mocks/
]

# Compiled once at import time so is_test_file does not look each pattern up
# again on every call.
_COMPILED_TEST_PATTERNS = tuple(re.compile(pattern) for pattern in TEST_PATTERNS)


def is_test_file(file_path: str) -> bool:
    """
    Check if file is a test file using regex patterns.

    The path is lower-cased and then searched with every entry of
    TEST_PATTERNS; the first hit wins.

    Args:
        file_path: Path to check (can be relative or absolute). An empty
            or otherwise falsy value is treated as "not a test file".

    Returns:
        True if file matches any test pattern
    """
    if not file_path:
        return False
    lowered = file_path.lower()
    return any(rx.search(lowered) for rx in _COMPILED_TEST_PATTERNS)
39+
40+
41+
def filter_test_files(results: List[dict], include_tests: bool = False) -> List[dict]:
    """
    Drop test-file entries from a list of search results.

    Args:
        results: Search result dicts; each one's 'file_path' value (empty
            string when the key is absent) is passed to is_test_file
        include_tests: When True the input list is handed back untouched;
            when False a new list without the test-file entries is built

    Returns:
        The original list if include_tests is set, otherwise a new list
        holding only the entries that is_test_file rejects
    """
    if not include_tests:
        kept = []
        for entry in results:
            if is_test_file(entry.get("file_path", "")):
                continue
            kept.append(entry)
        return kept
    return results
55+
56+
57+
def has_test_file_in_top_n(results: List[dict], n: int = 3) -> bool:
    """
    Report whether a test file appears among the first N results.

    Useful for benchmarking test pollution.

    Args:
        results: Search result dicts; the 'file_path' value of each
            (empty string when the key is absent) is checked
        n: How many leading results to inspect (slice ``results[:n]``)

    Returns:
        True as soon as one of the inspected results is classified as a
        test file by is_test_file, False otherwise
    """
    # Generator keeps the lookup lazy so evaluation stops at the first hit.
    leading_paths = (item.get("file_path", "") for item in results[:n])
    return any(is_test_file(path) for path in leading_paths)

0 commit comments

Comments
 (0)