From ccafa4a470516eef27d8f045605938d8f0fb6f49 Mon Sep 17 00:00:00 2001 From: Devanshu Rajesh Chicholikar Date: Wed, 11 Mar 2026 00:19:28 -0400 Subject: [PATCH] fix: style_analyzer and dna_extractor respect include_paths for subset indexing When users select specific directories via include_paths (e.g. 2 packages out of 1767 files), the Overview and Code Style pages showed counts for the full repo instead of the indexed subset. Root cause: dependency_analyzer respected include_paths but style_analyzer and dna_extractor did not. Changes: - style_analyzer.analyze_repository_style() now accepts include_paths - dna_extractor.extract_dna() now accepts include_paths - dna_extractor._discover_files() now accepts include_paths - analysis.py routes pass repo.get('include_paths') to both analyzers - include_paths sanitization (corrupt jsonb guard) in both services - Path.parts filtering matches dependency_analyzer pattern (PR #280) Closes OPE-119 --- backend/routes/analysis.py | 4 +-- backend/services/dna_extractor.py | 51 +++++++++++++++++++++++------- backend/services/style_analyzer.py | 33 ++++++++++++++++--- 3 files changed, 69 insertions(+), 19 deletions(-) diff --git a/backend/routes/analysis.py b/backend/routes/analysis.py index bc59ea0..2eb4daa 100644 --- a/backend/routes/analysis.py +++ b/backend/routes/analysis.py @@ -134,7 +134,7 @@ async def get_style_analysis( return {**cached_style, "cached": True} logger.info("Analyzing code style", repo_id=repo_id) - style_data = style_analyzer.analyze_repository_style(repo["local_path"]) + style_data = style_analyzer.analyze_repository_style(repo["local_path"], include_paths=repo.get("include_paths")) style_analyzer.save_to_cache(repo_id, style_data) return {**style_data, "cached": False} @@ -178,7 +178,7 @@ async def get_codebase_dna( logger.info("Extracting codebase DNA", repo_id=repo_id) metrics.increment("dna_extractions") - dna = dna_extractor.extract_dna(repo["local_path"], repo_id) + dna = dna_extractor.extract_dna(repo["local_path"], repo_id, include_paths=repo.get("include_paths")) dna_extractor.save_to_cache(repo_id, dna) if format == "markdown": diff --git a/backend/services/dna_extractor.py b/backend/services/dna_extractor.py index c8ff202..524a2ad 100644 --- a/backend/services/dna_extractor.py +++ b/backend/services/dna_extractor.py @@ -361,8 +361,11 @@ def _detect_language(self, file_path: str) -> str: '.tsx': 'typescript', }.get(ext, 'unknown') - def _discover_files(self, repo_path: Path) -> List[Path]: - """Find all code files, skipping irrelevant directories and symlinks""" + def _discover_files(self, repo_path: Path, include_paths: Optional[List[str]] = None) -> List[Path]: + """Find all code files, skipping irrelevant directories and symlinks. + + If include_paths is set, only files within those directories are returned. + """ files = [] extensions = {'.py', '.js', '.jsx', '.ts', '.tsx', '.sql'} @@ -370,12 +373,21 @@ def _discover_files(self, repo_path: Path) -> List[Path]: for item in repo_path.rglob('*'): if item.is_symlink(): continue - if item.is_file() and item.suffix in extensions: - if not any(skip in item.parts for skip in self.SKIP_DIRS): - files.append(item) - if len(files) >= self.MAX_FILES: - logger.warning(f"Hit max file limit ({self.MAX_FILES})") - break + if not item.is_file() or item.suffix not in extensions: + continue + if any(skip in item.parts for skip in self.SKIP_DIRS): + continue + if include_paths: + rel_parts = item.relative_to(repo_path).parts + if not any( + rel_parts[:len(Path(p).parts)] == Path(p).parts + for p in include_paths + ): + continue + files.append(item) + if len(files) >= self.MAX_FILES: + logger.warning(f"Hit max file limit ({self.MAX_FILES})") + break except Exception as e: logger.error(f"Error discovering files: {e}") @@ -914,8 +926,11 @@ def _extract_config_patterns(self, files: List[Path], repo_path: Path) -> Config return pattern - def extract_dna(self, repo_path: str, repo_id: str) -> CodebaseDNA: - """Extract complete DNA profile from a codebase""" + def extract_dna(self, repo_path: str, repo_id: str, include_paths: Optional[List[str]] = None) -> CodebaseDNA: + """Extract complete DNA profile from a codebase. + + If include_paths is set, only files within those directories are analyzed. + """ import time start_time = time.time() @@ -929,13 +944,25 @@ def extract_dna(self, repo_path: str, repo_id: str) -> CodebaseDNA: logger.error(f"Repo path is not a directory: {repo_path}") raise ValueError(f"Repository path is not a directory: {repo_path}") + # Sanitize include_paths (could be corrupt jsonb from DB) + if include_paths: + cleaned = [] + for p in include_paths: + if not isinstance(p, str): + continue + p = p.replace('\\', '/').strip().strip('/') + if not p or '..' in p.split('/'): + continue + cleaned.append(p) + include_paths = cleaned or None + # reset cache for fresh extraction self._reset_cache() - logger.info("Extracting codebase DNA", repo_id=repo_id, path=str(repo_path)) + logger.info("Extracting codebase DNA", repo_id=repo_id, path=str(repo_path), include_paths=include_paths) # Discover files - files = self._discover_files(repo_path) + files = self._discover_files(repo_path, include_paths=include_paths) logger.info(f"Found {len(files)} code files") # Detect framework first diff --git a/backend/services/style_analyzer.py b/backend/services/style_analyzer.py index d5c60dc..ff6e4cd 100644 --- a/backend/services/style_analyzer.py +++ b/backend/services/style_analyzer.py @@ -139,11 +139,26 @@ def _check_type_hints(self, source_code: str, language: str) -> bool: else: return ': ' in source_code and ('interface' in source_code or 'type ' in source_code) - def analyze_repository_style(self, repo_path: str) -> Dict: - """Analyze coding style patterns across repository""" + def analyze_repository_style(self, repo_path: str, include_paths: Optional[List[str]] = None) -> Dict: + """Analyze coding style patterns across repository. + + If include_paths is set, only files within those directories are analyzed. + """ repo_path = Path(repo_path) - logger.info("Analyzing code style for repository") + # Sanitize include_paths (could be corrupt jsonb from DB) + if include_paths: + cleaned = [] + for p in include_paths: + if not isinstance(p, str): + continue + p = p.replace('\\', '/').strip().strip('/') + if not p or '..' in p.split('/'): + continue + cleaned.append(p) + include_paths = cleaned or None + + logger.info("Analyzing code style for repository", include_paths=include_paths) # Discover code files code_files = [] @@ -155,8 +170,16 @@ def analyze_repository_style(self, repo_path: str) -> Dict: continue if any(skip in file_path.parts for skip in skip_dirs): continue - if file_path.suffix in extensions: - code_files.append(file_path) + if file_path.suffix not in extensions: + continue + if include_paths: + rel_parts = file_path.relative_to(repo_path).parts + if not any( + rel_parts[:len(Path(p).parts)] == Path(p).parts + for p in include_paths + ): + continue + code_files.append(file_path) # Collect style data function_names = []