Skip to content

Commit ccafa4a

Browse files
committed
fix: style_analyzer and dna_extractor respect include_paths for subset indexing
When users select specific directories via include_paths (e.g. 2 packages out of 1767 files), the Overview and Code Style pages showed counts for the full repo instead of the indexed subset.

Root cause: dependency_analyzer respected include_paths, but style_analyzer and dna_extractor did not.

Changes:
- style_analyzer.analyze_repository_style() now accepts include_paths
- dna_extractor.extract_dna() now accepts include_paths
- dna_extractor._discover_files() now accepts include_paths
- analysis.py routes pass repo.get('include_paths') to both analyzers
- include_paths sanitization (corrupt jsonb guard) in both services
- Path.parts filtering matches dependency_analyzer pattern (PR #280)

Closes OPE-119
1 parent 09513de commit ccafa4a

3 files changed

Lines changed: 69 additions & 19 deletions

File tree

backend/routes/analysis.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,7 @@ async def get_style_analysis(
134134
return {**cached_style, "cached": True}
135135

136136
logger.info("Analyzing code style", repo_id=repo_id)
137-
style_data = style_analyzer.analyze_repository_style(repo["local_path"])
137+
style_data = style_analyzer.analyze_repository_style(repo["local_path"], include_paths=repo.get("include_paths"))
138138
style_analyzer.save_to_cache(repo_id, style_data)
139139

140140
return {**style_data, "cached": False}
@@ -178,7 +178,7 @@ async def get_codebase_dna(
178178
logger.info("Extracting codebase DNA", repo_id=repo_id)
179179
metrics.increment("dna_extractions")
180180

181-
dna = dna_extractor.extract_dna(repo["local_path"], repo_id)
181+
dna = dna_extractor.extract_dna(repo["local_path"], repo_id, include_paths=repo.get("include_paths"))
182182
dna_extractor.save_to_cache(repo_id, dna)
183183

184184
if format == "markdown":

backend/services/dna_extractor.py

Lines changed: 39 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -361,21 +361,33 @@ def _detect_language(self, file_path: str) -> str:
361361
'.tsx': 'typescript',
362362
}.get(ext, 'unknown')
363363

364-
def _discover_files(self, repo_path: Path) -> List[Path]:
365-
"""Find all code files, skipping irrelevant directories and symlinks"""
364+
def _discover_files(self, repo_path: Path, include_paths: Optional[List[str]] = None) -> List[Path]:
365+
"""Find all code files, skipping irrelevant directories and symlinks.
366+
367+
If include_paths is set, only files within those directories are returned.
368+
"""
366369
files = []
367370
extensions = {'.py', '.js', '.jsx', '.ts', '.tsx', '.sql'}
368371

369372
try:
370373
for item in repo_path.rglob('*'):
371374
if item.is_symlink():
372375
continue
373-
if item.is_file() and item.suffix in extensions:
374-
if not any(skip in item.parts for skip in self.SKIP_DIRS):
375-
files.append(item)
376-
if len(files) >= self.MAX_FILES:
377-
logger.warning(f"Hit max file limit ({self.MAX_FILES})")
378-
break
376+
if not item.is_file() or item.suffix not in extensions:
377+
continue
378+
if any(skip in item.parts for skip in self.SKIP_DIRS):
379+
continue
380+
if include_paths:
381+
rel_parts = item.relative_to(repo_path).parts
382+
if not any(
383+
rel_parts[:len(Path(p).parts)] == Path(p).parts
384+
for p in include_paths
385+
):
386+
continue
387+
files.append(item)
388+
if len(files) >= self.MAX_FILES:
389+
logger.warning(f"Hit max file limit ({self.MAX_FILES})")
390+
break
379391
except Exception as e:
380392
logger.error(f"Error discovering files: {e}")
381393

@@ -914,8 +926,11 @@ def _extract_config_patterns(self, files: List[Path], repo_path: Path) -> Config
914926

915927
return pattern
916928

917-
def extract_dna(self, repo_path: str, repo_id: str) -> CodebaseDNA:
918-
"""Extract complete DNA profile from a codebase"""
929+
def extract_dna(self, repo_path: str, repo_id: str, include_paths: Optional[List[str]] = None) -> CodebaseDNA:
930+
"""Extract complete DNA profile from a codebase.
931+
932+
If include_paths is set, only files within those directories are analyzed.
933+
"""
919934
import time
920935
start_time = time.time()
921936

@@ -929,13 +944,25 @@ def extract_dna(self, repo_path: str, repo_id: str) -> CodebaseDNA:
929944
logger.error(f"Repo path is not a directory: {repo_path}")
930945
raise ValueError(f"Repository path is not a directory: {repo_path}")
931946

947+
# Sanitize include_paths (could be corrupt jsonb from DB)
948+
if include_paths:
949+
cleaned = []
950+
for p in include_paths:
951+
if not isinstance(p, str):
952+
continue
953+
p = p.replace('\\', '/').strip().strip('/')
954+
if not p or '..' in p.split('/'):
955+
continue
956+
cleaned.append(p)
957+
include_paths = cleaned or None
958+
932959
# reset cache for fresh extraction
933960
self._reset_cache()
934961

935-
logger.info("Extracting codebase DNA", repo_id=repo_id, path=str(repo_path))
962+
logger.info("Extracting codebase DNA", repo_id=repo_id, path=str(repo_path), include_paths=include_paths)
936963

937964
# Discover files
938-
files = self._discover_files(repo_path)
965+
files = self._discover_files(repo_path, include_paths=include_paths)
939966
logger.info(f"Found {len(files)} code files")
940967

941968
# Detect framework first

backend/services/style_analyzer.py

Lines changed: 28 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -139,11 +139,26 @@ def _check_type_hints(self, source_code: str, language: str) -> bool:
139139
else:
140140
return ': ' in source_code and ('interface' in source_code or 'type ' in source_code)
141141

142-
def analyze_repository_style(self, repo_path: str) -> Dict:
143-
"""Analyze coding style patterns across repository"""
142+
def analyze_repository_style(self, repo_path: str, include_paths: Optional[List[str]] = None) -> Dict:
143+
"""Analyze coding style patterns across repository.
144+
145+
If include_paths is set, only files within those directories are analyzed.
146+
"""
144147
repo_path = Path(repo_path)
145148

146-
logger.info("Analyzing code style for repository")
149+
# Sanitize include_paths (could be corrupt jsonb from DB)
150+
if include_paths:
151+
cleaned = []
152+
for p in include_paths:
153+
if not isinstance(p, str):
154+
continue
155+
p = p.replace('\\', '/').strip().strip('/')
156+
if not p or '..' in p.split('/'):
157+
continue
158+
cleaned.append(p)
159+
include_paths = cleaned or None
160+
161+
logger.info("Analyzing code style for repository", include_paths=include_paths)
147162

148163
# Discover code files
149164
code_files = []
@@ -155,8 +170,16 @@ def analyze_repository_style(self, repo_path: str) -> Dict:
155170
continue
156171
if any(skip in file_path.parts for skip in skip_dirs):
157172
continue
158-
if file_path.suffix in extensions:
159-
code_files.append(file_path)
173+
if file_path.suffix not in extensions:
174+
continue
175+
if include_paths:
176+
rel_parts = file_path.relative_to(repo_path).parts
177+
if not any(
178+
rel_parts[:len(Path(p).parts)] == Path(p).parts
179+
for p in include_paths
180+
):
181+
continue
182+
code_files.append(file_path)
160183

161184
# Collect style data
162185
function_names = []

0 commit comments

Comments
 (0)